{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 794.265625, "epoch": 0.0004, "grad_norm": 1.0286895913431777, "kl": 0.0, "learning_rate": 8e-08, "loss": 0.001, "reward": 0.435546875, "reward_std": 0.5123052969574928, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.125, "rewards/tag_count_reward": 0.279296875, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 874.1171875, "epoch": 0.0008, "grad_norm": 0.5575479603207912, "kl": 0.0, "learning_rate": 1.6e-07, "loss": 0.0317, "reward": 0.5703125, "reward_std": 0.406142421066761, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.2109375, "rewards/tag_count_reward": 0.359375, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 783.3828125, "epoch": 0.0012, "grad_norm": 0.7607369099531464, "kl": 0.00012254714965820312, "learning_rate": 2.4000000000000003e-07, "loss": -0.0018, "reward": 0.69140625, "reward_std": 0.6304283142089844, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.234375, "rewards/tag_count_reward": 0.35546875, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 815.71875, "epoch": 0.0016, "grad_norm": 0.6096461025819149, "kl": 0.00010502338409423828, "learning_rate": 3.2e-07, "loss": 0.006, "reward": 0.5859375, "reward_std": 0.4485991410911083, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.2265625, "rewards/tag_count_reward": 0.328125, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 816.2109375, "epoch": 0.002, "grad_norm": 0.890763717281587, "kl": 0.00016641616821289062, "learning_rate": 4.0000000000000003e-07, "loss": 0.04, "reward": 0.67578125, "reward_std": 0.5456436574459076, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.1875, "rewards/tag_count_reward": 0.31640625, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 765.578125, "epoch": 0.0024, "grad_norm": 0.9086967335856038, "kl": 0.00012731552124023438, "learning_rate": 4.800000000000001e-07, "loss": 0.0543, "reward": 0.94140625, "reward_std": 0.714580699801445, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.390625, "rewards/tag_count_reward": 0.51171875, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 876.171875, "epoch": 0.0028, "grad_norm": 1.052639038202758, "kl": 0.00026416778564453125, "learning_rate": 5.6e-07, "loss": 0.0234, "reward": 0.517578125, "reward_std": 0.6703888922929764, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.171875, "rewards/tag_count_reward": 0.337890625, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 790.96875, "epoch": 0.0032, "grad_norm": 0.74340367574203, "kl": 0.00024509429931640625, "learning_rate": 6.4e-07, "loss": 0.0097, "reward": 0.65234375, "reward_std": 0.5865648165345192, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.21875, "rewards/tag_count_reward": 0.40234375, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 785.1875, "epoch": 0.0036, "grad_norm": 0.938668221562659, "kl": 0.0008196830749511719, "learning_rate": 7.2e-07, "loss": 0.0057, "reward": 0.650390625, "reward_std": 0.6973848044872284, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.2265625, "rewards/tag_count_reward": 0.376953125, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 804.7421875, "epoch": 0.004, "grad_norm": 0.5211198183675853, "kl": 0.004367828369140625, "learning_rate": 8.000000000000001e-07, "loss": 0.0383, "reward": 0.845703125, "reward_std": 0.7800817787647247, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.2734375, "rewards/tag_count_reward": 0.501953125, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 632.1640625, "epoch": 0.0044, "grad_norm": 0.9876390799388698, "kl": 0.010082244873046875, "learning_rate": 8.8e-07, "loss": 0.001, "reward": 1.365234375, "reward_std": 0.5490432158112526, "rewards/accuracy_reward": 0.2734375, "rewards/format_reward": 0.4375, "rewards/tag_count_reward": 0.654296875, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 932.5703125, "epoch": 0.0048, "grad_norm": 2.398919428315798, "kl": 0.021348953247070312, "learning_rate": 9.600000000000001e-07, "loss": 0.0598, "reward": 0.763671875, "reward_std": 0.7361755445599556, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.25, "rewards/tag_count_reward": 0.435546875, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 833.1640625, "epoch": 0.0052, "grad_norm": 45.15918341946321, "kl": 0.4045391082763672, "learning_rate": 1.04e-06, "loss": 0.0848, "reward": 1.1171875, "reward_std": 0.6872416734695435, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.3984375, "rewards/tag_count_reward": 0.5703125, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 813.8203125, "epoch": 0.0056, "grad_norm": 54.56101538089529, "kl": 0.44304943084716797, "learning_rate": 1.12e-06, "loss": 0.0469, "reward": 1.26953125, "reward_std": 0.6652578935027122, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.5, "rewards/tag_count_reward": 0.72265625, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 844.3671875, "epoch": 0.006, "grad_norm": 30.60509465264487, "kl": 0.2570457458496094, "learning_rate": 1.2000000000000002e-06, "loss": 0.0682, "reward": 1.01171875, "reward_std": 0.7625316083431244, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.390625, "rewards/tag_count_reward": 0.58203125, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 775.71875, "epoch": 0.0064, "grad_norm": 12.912503118850111, "kl": 0.1331787109375, "learning_rate": 1.28e-06, "loss": 0.0304, "reward": 0.970703125, "reward_std": 0.7644316107034683, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.3515625, "rewards/tag_count_reward": 0.525390625, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 839.6015625, "epoch": 0.0068, "grad_norm": 1.2760626759008138, "kl": 0.01568603515625, "learning_rate": 1.3600000000000001e-06, "loss": 0.0309, "reward": 1.1171875, "reward_std": 0.6193623393774033, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.4375, "rewards/tag_count_reward": 0.6484375, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 764.0859375, "epoch": 0.0072, "grad_norm": 1.265891036417497, "kl": 0.018768310546875, "learning_rate": 1.44e-06, "loss": 0.0182, "reward": 1.078125, "reward_std": 0.783120796084404, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.4296875, "rewards/tag_count_reward": 0.625, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 761.3671875, "epoch": 0.0076, "grad_norm": 0.8725465574911918, "kl": 0.003345489501953125, "learning_rate": 1.52e-06, "loss": 0.0469, "reward": 0.8515625, "reward_std": 0.6071489304304123, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.328125, "rewards/tag_count_reward": 0.4765625, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 775.3125, "epoch": 0.008, "grad_norm": 0.7240484187357823, "kl": 0.0031757354736328125, "learning_rate": 1.6000000000000001e-06, "loss": 0.0496, "reward": 1.416015625, "reward_std": 0.6325365155935287, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.59375, "rewards/tag_count_reward": 0.744140625, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 854.0078125, "epoch": 0.0084, "grad_norm": 0.45831000837053243, "kl": 0.0055389404296875, "learning_rate": 1.6800000000000002e-06, "loss": 0.0145, "reward": 0.76171875, "reward_std": 0.5608582049608231, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.21875, "rewards/tag_count_reward": 0.42578125, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 870.2734375, "epoch": 0.0088, "grad_norm": 0.5743508454898445, "kl": 0.00562286376953125, "learning_rate": 1.76e-06, "loss": 0.0129, "reward": 0.802734375, "reward_std": 0.5962897464632988, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.1875, "rewards/tag_count_reward": 0.404296875, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 713.7421875, "epoch": 0.0092, "grad_norm": 1.4930997948758462, "kl": 0.0184326171875, "learning_rate": 1.8400000000000002e-06, "loss": 0.0385, "reward": 1.138671875, "reward_std": 0.6307023242115974, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.359375, "rewards/tag_count_reward": 0.591796875, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 677.9921875, "epoch": 0.0096, "grad_norm": 0.8560872817042297, "kl": 0.004001617431640625, "learning_rate": 1.9200000000000003e-06, "loss": 0.0421, "reward": 0.82421875, "reward_std": 0.7972037866711617, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.2578125, "rewards/tag_count_reward": 0.45703125, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 817.359375, "epoch": 0.01, "grad_norm": 0.9221592976582135, "kl": 0.009571075439453125, "learning_rate": 2.0000000000000003e-06, "loss": 0.0208, "reward": 0.9765625, "reward_std": 0.7448011413216591, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.2890625, "rewards/tag_count_reward": 0.53125, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 791.0234375, "epoch": 0.0104, "grad_norm": 0.7546047439196559, "kl": 0.011016845703125, "learning_rate": 2.08e-06, "loss": 0.0458, "reward": 0.8828125, "reward_std": 0.6309758052229881, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.2578125, "rewards/tag_count_reward": 0.5390625, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 824.375, "epoch": 0.0108, "grad_norm": 0.5633419265905922, "kl": 0.01001739501953125, "learning_rate": 2.16e-06, "loss": 0.025, "reward": 1.11328125, "reward_std": 0.49964234232902527, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.3515625, "rewards/tag_count_reward": 0.55859375, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 741.296875, "epoch": 0.0112, "grad_norm": 0.6944160112458072, "kl": 0.00882720947265625, "learning_rate": 2.24e-06, "loss": -0.0132, "reward": 1.037109375, "reward_std": 0.6358464658260345, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.3984375, "rewards/tag_count_reward": 0.623046875, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 800.078125, "epoch": 0.0116, "grad_norm": 4.586105097562267, "kl": 0.05352783203125, "learning_rate": 2.3200000000000002e-06, "loss": 0.0421, "reward": 1.390625, "reward_std": 0.6054098457098007, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.5703125, "rewards/tag_count_reward": 0.7734375, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 707.1484375, "epoch": 0.012, "grad_norm": 1.405540377507053, "kl": 0.036468505859375, "learning_rate": 2.4000000000000003e-06, "loss": 0.0638, "reward": 1.505859375, "reward_std": 0.5306108221411705, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.6484375, "rewards/tag_count_reward": 0.833984375, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 771.640625, "epoch": 0.0124, "grad_norm": 4.853294729484406, "kl": 0.0626983642578125, "learning_rate": 2.4800000000000004e-06, "loss": 0.0423, "reward": 1.44921875, "reward_std": 0.5903428271412849, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.578125, "rewards/tag_count_reward": 0.80859375, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 893.921875, "epoch": 0.0128, "grad_norm": 1.5154067981909143, "kl": 0.0326995849609375, "learning_rate": 2.56e-06, "loss": 0.0723, "reward": 1.2734375, "reward_std": 0.676678478717804, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.5078125, "rewards/tag_count_reward": 0.75, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 715.75, "epoch": 0.0132, "grad_norm": 3.6048915941850073, "kl": 0.0478973388671875, "learning_rate": 2.64e-06, "loss": 0.0688, "reward": 1.6484375, "reward_std": 0.4692293554544449, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.734375, "rewards/tag_count_reward": 0.90625, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 852.3828125, "epoch": 0.0136, "grad_norm": 0.6331645320873488, "kl": 0.02523040771484375, "learning_rate": 2.7200000000000002e-06, "loss": 0.0557, "reward": 1.419921875, "reward_std": 0.6677964478731155, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.5078125, "rewards/tag_count_reward": 0.748046875, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 628.421875, "epoch": 0.014, "grad_norm": 404.58440492684235, "kl": 2.0081024169921875, "learning_rate": 2.8000000000000003e-06, "loss": 0.0982, "reward": 1.751953125, "reward_std": 0.3495968207716942, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.8203125, "rewards/tag_count_reward": 0.923828125, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 609.9609375, "epoch": 0.0144, "grad_norm": 0.9101962136281517, "kl": 0.02862548828125, "learning_rate": 2.88e-06, "loss": 0.0491, "reward": 1.86328125, "reward_std": 0.5431485027074814, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.7578125, "rewards/tag_count_reward": 0.91796875, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 749.171875, "epoch": 0.0148, "grad_norm": 0.7990645941280955, "kl": 0.016082763671875, "learning_rate": 2.96e-06, "loss": 0.0737, "reward": 1.658203125, "reward_std": 0.7382423877716064, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.6484375, "rewards/tag_count_reward": 0.798828125, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 817.390625, "epoch": 0.0152, "grad_norm": 1.6725972822795077, "kl": 0.0298919677734375, "learning_rate": 3.04e-06, "loss": 0.0235, "reward": 1.53125, "reward_std": 0.6619657725095749, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.65625, "rewards/tag_count_reward": 0.8125, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 802.609375, "epoch": 0.0156, "grad_norm": 0.46096703521578053, "kl": 0.014862060546875, "learning_rate": 3.12e-06, "loss": 0.0603, "reward": 1.330078125, "reward_std": 0.5491436049342155, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.546875, "rewards/tag_count_reward": 0.767578125, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 691.859375, "epoch": 0.016, "grad_norm": 0.6617744411768414, "kl": 0.01708984375, "learning_rate": 3.2000000000000003e-06, "loss": 0.0561, "reward": 1.6015625, "reward_std": 0.39237239584326744, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.6796875, "rewards/tag_count_reward": 0.8828125, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 653.6171875, "epoch": 0.0164, "grad_norm": 0.5540375290757477, "kl": 0.0152130126953125, "learning_rate": 3.2800000000000004e-06, "loss": 0.0537, "reward": 1.857421875, "reward_std": 0.5226252228021622, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.8515625, "rewards/tag_count_reward": 0.935546875, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 708.859375, "epoch": 0.0168, "grad_norm": 0.535049815644606, "kl": 0.02301025390625, "learning_rate": 3.3600000000000004e-06, "loss": 0.031, "reward": 1.5078125, "reward_std": 0.43379031121730804, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.6875, "rewards/tag_count_reward": 0.7890625, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 653.2265625, "epoch": 0.0172, "grad_norm": 12.820933821247065, "kl": 0.19781494140625, "learning_rate": 3.44e-06, "loss": 0.0377, "reward": 1.75390625, "reward_std": 0.5086361281573772, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.75, "rewards/tag_count_reward": 0.88671875, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 568.578125, "epoch": 0.0176, "grad_norm": 2.187364087126418, "kl": 0.029296875, "learning_rate": 3.52e-06, "loss": 0.0553, "reward": 2.119140625, "reward_std": 0.29410287737846375, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.994140625, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 654.140625, "epoch": 0.018, "grad_norm": 1.054231873939232, "kl": 0.0277099609375, "learning_rate": 3.6000000000000003e-06, "loss": 0.0412, "reward": 1.68359375, "reward_std": 0.5287843346595764, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.75, "rewards/tag_count_reward": 0.87109375, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 682.125, "epoch": 0.0184, "grad_norm": 2.0052315403150778, "kl": 0.038970947265625, "learning_rate": 3.6800000000000003e-06, "loss": 0.0577, "reward": 2.07421875, "reward_std": 0.6216925233602524, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.7578125, "rewards/tag_count_reward": 0.87890625, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 595.875, "epoch": 0.0188, "grad_norm": 1.0995304255757818, "kl": 0.05413818359375, "learning_rate": 3.7600000000000004e-06, "loss": 0.0408, "reward": 1.91796875, "reward_std": 0.42454977333545685, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.8125, "rewards/tag_count_reward": 0.91796875, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 579.1875, "epoch": 0.0192, "grad_norm": 36.502607241839044, "kl": 0.66455078125, "learning_rate": 3.8400000000000005e-06, "loss": 0.0832, "reward": 1.6328125, "reward_std": 0.4735468849539757, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.7109375, "rewards/tag_count_reward": 0.8515625, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 724.2578125, "epoch": 0.0196, "grad_norm": 3.750041742289201, "kl": 0.0772705078125, "learning_rate": 3.920000000000001e-06, "loss": 0.0492, "reward": 1.79296875, "reward_std": 0.5016569346189499, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.7578125, "rewards/tag_count_reward": 0.85546875, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 750.984375, "epoch": 0.02, "grad_norm": 0.7881254251575291, "kl": 0.06591796875, "learning_rate": 4.000000000000001e-06, "loss": 0.0669, "reward": 1.66015625, "reward_std": 0.4621131271123886, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.75, "rewards/tag_count_reward": 0.89453125, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 584.1484375, "epoch": 0.0204, "grad_norm": 18.653448508984855, "kl": 0.54345703125, "learning_rate": 4.08e-06, "loss": 0.0817, "reward": 1.79296875, "reward_std": 0.49891526252031326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.93359375, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 564.3046875, "epoch": 0.0208, "grad_norm": 2.054222400396483, "kl": 0.12890625, "learning_rate": 4.16e-06, "loss": 0.0609, "reward": 1.66796875, "reward_std": 0.602379709482193, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.7421875, "rewards/tag_count_reward": 0.87109375, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 649.03125, "epoch": 0.0212, "grad_norm": 2.1158871854329844, "kl": 0.0987548828125, "learning_rate": 4.24e-06, "loss": 0.0221, "reward": 1.95703125, "reward_std": 0.3976046293973923, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.95703125, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 700.1796875, "epoch": 0.0216, "grad_norm": 0.7123918061188437, "kl": 0.0792236328125, "learning_rate": 4.32e-06, "loss": 0.0358, "reward": 1.755859375, "reward_std": 0.55485038459301, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.765625, "rewards/tag_count_reward": 0.865234375, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 610.0859375, "epoch": 0.022, "grad_norm": 1.3578027471922445, "kl": 0.07781982421875, "learning_rate": 4.4e-06, "loss": 0.036, "reward": 1.853515625, "reward_std": 0.5372420996427536, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.923828125, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 697.390625, "epoch": 0.0224, "grad_norm": 3.093795322120745, "kl": 0.235595703125, "learning_rate": 4.48e-06, "loss": 0.0771, "reward": 1.685546875, "reward_std": 0.5580805465579033, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.78125, "rewards/tag_count_reward": 0.880859375, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 678.921875, "epoch": 0.0228, "grad_norm": 1.6176888127524798, "kl": 0.18804931640625, "learning_rate": 4.56e-06, "loss": 0.0451, "reward": 1.740234375, "reward_std": 0.4843317121267319, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.828125, "rewards/tag_count_reward": 0.912109375, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 714.9140625, "epoch": 0.0232, "grad_norm": 5.306551396284782, "kl": 1.0615234375, "learning_rate": 4.6400000000000005e-06, "loss": 0.0826, "reward": 1.541015625, "reward_std": 0.5755985230207443, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.6953125, "rewards/tag_count_reward": 0.845703125, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 602.9765625, "epoch": 0.0236, "grad_norm": 6.378551762817626, "kl": 0.47900390625, "learning_rate": 4.7200000000000005e-06, "loss": 0.0436, "reward": 1.765625, "reward_std": 0.45847922936081886, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.921875, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 637.515625, "epoch": 0.024, "grad_norm": 9.672307817634993, "kl": 0.79638671875, "learning_rate": 4.800000000000001e-06, "loss": 0.0485, "reward": 1.86328125, "reward_std": 0.4784998521208763, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.796875, "rewards/tag_count_reward": 0.92578125, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 654.65625, "epoch": 0.0244, "grad_norm": 1684.8852182036414, "kl": 49.65625, "learning_rate": 4.880000000000001e-06, "loss": 1.546, "reward": 1.986328125, "reward_std": 0.6733379065990448, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.765625, "rewards/tag_count_reward": 0.939453125, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 688.8984375, "epoch": 0.0248, "grad_norm": 42.60204778055314, "kl": 4.640625, "learning_rate": 4.960000000000001e-06, "loss": 0.0927, "reward": 1.669921875, "reward_std": 0.5961694568395615, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.7578125, "rewards/tag_count_reward": 0.880859375, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 634.25, "epoch": 0.0252, "grad_norm": 5.104158693436992, "kl": 0.5811767578125, "learning_rate": 5.04e-06, "loss": 0.0302, "reward": 1.814453125, "reward_std": 0.3318721428513527, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.939453125, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 621.7421875, "epoch": 0.0256, "grad_norm": 22.824672682292917, "kl": 0.48388671875, "learning_rate": 5.12e-06, "loss": 0.0299, "reward": 1.693359375, "reward_std": 0.601629838347435, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.75, "rewards/tag_count_reward": 0.888671875, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 621.2421875, "epoch": 0.026, "grad_norm": 6.9491504549635845, "kl": 0.4981689453125, "learning_rate": 5.2e-06, "loss": 0.0107, "reward": 1.91015625, "reward_std": 0.536599799990654, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.8359375, "rewards/tag_count_reward": 0.94140625, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 629.03125, "epoch": 0.0264, "grad_norm": 5.4871449112421615, "kl": 1.74560546875, "learning_rate": 5.28e-06, "loss": 0.0128, "reward": 1.5546875, "reward_std": 0.5941050425171852, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.6796875, "rewards/tag_count_reward": 0.859375, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 600.359375, "epoch": 0.0268, "grad_norm": 13.476614004598828, "kl": 1.5546875, "learning_rate": 5.36e-06, "loss": 0.0969, "reward": 1.947265625, "reward_std": 0.4836155027151108, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.939453125, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 630.7890625, "epoch": 0.0272, "grad_norm": 14.182686728760256, "kl": 5.274658203125, "learning_rate": 5.4400000000000004e-06, "loss": 0.1013, "reward": 1.65234375, "reward_std": 0.5319748073816299, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.7265625, "rewards/tag_count_reward": 0.84765625, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 635.34375, "epoch": 0.0276, "grad_norm": 19.10683211507648, "kl": 5.095703125, "learning_rate": 5.5200000000000005e-06, "loss": 0.1335, "reward": 1.76171875, "reward_std": 0.5843851789832115, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.7578125, "rewards/tag_count_reward": 0.87890625, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 588.9375, "epoch": 0.028, "grad_norm": 9.888257528407012, "kl": 0.83935546875, "learning_rate": 5.600000000000001e-06, "loss": -0.0142, "reward": 1.765625, "reward_std": 0.630504697561264, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.75, "rewards/tag_count_reward": 0.8671875, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 651.4375, "epoch": 0.0284, "grad_norm": 9.324505940201398, "kl": 0.916015625, "learning_rate": 5.68e-06, "loss": 0.0171, "reward": 1.6484375, "reward_std": 0.6022404953837395, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.7421875, "rewards/tag_count_reward": 0.890625, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 693.1328125, "epoch": 0.0288, "grad_norm": 2.872866327972611, "kl": 2.2744140625, "learning_rate": 5.76e-06, "loss": 0.1068, "reward": 1.732421875, "reward_std": 0.6675616353750229, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.7578125, "rewards/tag_count_reward": 0.880859375, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 648.0859375, "epoch": 0.0292, "grad_norm": 8.640359909743772, "kl": 4.5986328125, "learning_rate": 5.84e-06, "loss": 0.1178, "reward": 1.53125, "reward_std": 0.5347791910171509, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.65625, "rewards/tag_count_reward": 0.875, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 578.6015625, "epoch": 0.0296, "grad_norm": 9.596137082420462, "kl": 5.078125, "learning_rate": 5.92e-06, "loss": 0.0886, "reward": 1.65234375, "reward_std": 0.544070228934288, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.6796875, "rewards/tag_count_reward": 0.89453125, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 643.796875, "epoch": 0.03, "grad_norm": 4.265905749743615, "kl": 1.86865234375, "learning_rate": 6e-06, "loss": 0.044, "reward": 1.6640625, "reward_std": 0.613326832652092, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.7734375, "rewards/tag_count_reward": 0.890625, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 716.375, "epoch": 0.0304, "grad_norm": 8.126790155861093, "kl": 2.205078125, "learning_rate": 6.08e-06, "loss": -0.0001, "reward": 1.705078125, "reward_std": 0.5496121942996979, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.7890625, "rewards/tag_count_reward": 0.916015625, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 657.9375, "epoch": 0.0308, "grad_norm": 2.738082332752524, "kl": 1.8271484375, "learning_rate": 6.16e-06, "loss": 0.0951, "reward": 1.84375, "reward_std": 0.4469580873847008, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.8203125, "rewards/tag_count_reward": 0.8984375, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 695.90625, "epoch": 0.0312, "grad_norm": 65.84606044296459, "kl": 14.1640625, "learning_rate": 6.24e-06, "loss": 0.6613, "reward": 1.71484375, "reward_std": 0.6925430148839951, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.7578125, "rewards/tag_count_reward": 0.83203125, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 791.953125, "epoch": 0.0316, "grad_norm": 28.96597655302563, "kl": 10.5859375, "learning_rate": 6.3200000000000005e-06, "loss": 0.351, "reward": 1.341796875, "reward_std": 0.640436664223671, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.6015625, "rewards/tag_count_reward": 0.740234375, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 767.8828125, "epoch": 0.032, "grad_norm": 6.73875763984808, "kl": 4.37109375, "learning_rate": 6.4000000000000006e-06, "loss": 0.1372, "reward": 1.529296875, "reward_std": 0.6793028190732002, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.640625, "rewards/tag_count_reward": 0.748046875, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 648.4296875, "epoch": 0.0324, "grad_norm": 6.137406632903599, "kl": 2.390625, "learning_rate": 6.480000000000001e-06, "loss": 0.1024, "reward": 1.82421875, "reward_std": 0.625511460006237, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.8125, "rewards/tag_count_reward": 0.88671875, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 744.84375, "epoch": 0.0328, "grad_norm": 3.061107184613406, "kl": 1.705078125, "learning_rate": 6.560000000000001e-06, "loss": 0.1174, "reward": 1.517578125, "reward_std": 0.6658424884080887, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.6953125, "rewards/tag_count_reward": 0.814453125, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 647.1796875, "epoch": 0.0332, "grad_norm": 5.149174025360527, "kl": 2.328125, "learning_rate": 6.640000000000001e-06, "loss": 0.2837, "reward": 1.548828125, "reward_std": 0.7242977917194366, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.703125, "rewards/tag_count_reward": 0.791015625, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 650.9765625, "epoch": 0.0336, "grad_norm": 4.785986178340687, "kl": 3.431640625, "learning_rate": 6.720000000000001e-06, "loss": 0.1739, "reward": 1.5, "reward_std": 0.7277457565069199, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.7109375, "rewards/tag_count_reward": 0.765625, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 680.5625, "epoch": 0.034, "grad_norm": 44.47732703232938, "kl": 5.41015625, "learning_rate": 6.800000000000001e-06, "loss": 0.3819, "reward": 1.662109375, "reward_std": 0.8110300898551941, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.71875, "rewards/tag_count_reward": 0.755859375, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 705.40625, "epoch": 0.0344, "grad_norm": 10.581904738847351, "kl": 5.35546875, "learning_rate": 6.88e-06, "loss": 0.248, "reward": 1.431640625, "reward_std": 0.6760484576225281, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.6015625, "rewards/tag_count_reward": 0.681640625, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 648.0078125, "epoch": 0.0348, "grad_norm": 50.519349564680155, "kl": 3.033203125, "learning_rate": 6.96e-06, "loss": 0.211, "reward": 1.505859375, "reward_std": 0.585905596613884, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.734375, "rewards/tag_count_reward": 0.755859375, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 649.515625, "epoch": 0.0352, "grad_norm": 3.4556874553257098, "kl": 1.70703125, "learning_rate": 7.04e-06, "loss": 0.1907, "reward": 1.37109375, "reward_std": 0.6692806780338287, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.640625, "rewards/tag_count_reward": 0.70703125, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 647.2421875, "epoch": 0.0356, "grad_norm": 2.1564460419296747, "kl": 1.6015625, "learning_rate": 7.1200000000000004e-06, "loss": 0.2515, "reward": 1.587890625, "reward_std": 0.6694190353155136, "rewards/accuracy_reward": 0.2734375, "rewards/format_reward": 0.6484375, "rewards/tag_count_reward": 0.666015625, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 708.1328125, "epoch": 0.036, "grad_norm": 2.0754406215866803, "kl": 1.043701171875, "learning_rate": 7.2000000000000005e-06, "loss": 0.1886, "reward": 1.2421875, "reward_std": 0.6002995073795319, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.53125, "rewards/tag_count_reward": 0.6640625, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 763.3515625, "epoch": 0.0364, "grad_norm": 1.9158189231729492, "kl": 1.638671875, "learning_rate": 7.280000000000001e-06, "loss": 0.2275, "reward": 1.392578125, "reward_std": 0.6755427271127701, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.6171875, "rewards/tag_count_reward": 0.634765625, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 705.65625, "epoch": 0.0368, "grad_norm": 3.4544011372113337, "kl": 2.84765625, "learning_rate": 7.360000000000001e-06, "loss": 0.2649, "reward": 1.314453125, "reward_std": 0.7301520705223083, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.6875, "rewards/tag_count_reward": 0.533203125, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 668.2109375, "epoch": 0.0372, "grad_norm": 4.192833306033655, "kl": 2.4091796875, "learning_rate": 7.440000000000001e-06, "loss": 0.2625, "reward": 1.501953125, "reward_std": 0.5573997050523758, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.8359375, "rewards/tag_count_reward": 0.650390625, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 598.2265625, "epoch": 0.0376, "grad_norm": 3.3568577784482287, "kl": 1.953125, "learning_rate": 7.520000000000001e-06, "loss": 0.2864, "reward": 1.546875, "reward_std": 0.6448578089475632, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.59375, "rewards/tag_count_reward": 0.71875, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 684.7890625, "epoch": 0.038, "grad_norm": 5.392626465260217, "kl": 2.20703125, "learning_rate": 7.600000000000001e-06, "loss": 0.2322, "reward": 1.638671875, "reward_std": 0.6955953985452652, "rewards/accuracy_reward": 0.2421875, "rewards/format_reward": 0.7265625, "rewards/tag_count_reward": 0.669921875, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 796.7578125, "epoch": 0.0384, "grad_norm": 11.170730028171215, "kl": 4.69140625, "learning_rate": 7.680000000000001e-06, "loss": 0.2106, "reward": 0.9765625, "reward_std": 0.7731045484542847, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.359375, "rewards/tag_count_reward": 0.5859375, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 806.8125, "epoch": 0.0388, "grad_norm": 8.900274789859312, "kl": 2.68212890625, "learning_rate": 7.76e-06, "loss": 0.1345, "reward": 1.04296875, "reward_std": 0.7349528819322586, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.3359375, "rewards/tag_count_reward": 0.55078125, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 768.0234375, "epoch": 0.0392, "grad_norm": 1.692164783323472, "kl": 1.40283203125, "learning_rate": 7.840000000000001e-06, "loss": 0.1107, "reward": 1.10546875, "reward_std": 0.7223007827997208, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.4140625, "rewards/tag_count_reward": 0.65234375, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 710.203125, "epoch": 0.0396, "grad_norm": 13.79299167190569, "kl": 1.1083984375, "learning_rate": 7.92e-06, "loss": 0.1764, "reward": 1.73046875, "reward_std": 0.6537874341011047, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.7421875, "rewards/tag_count_reward": 0.86328125, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 690.7265625, "epoch": 0.04, "grad_norm": 10.342553930456331, "kl": 1.9609375, "learning_rate": 8.000000000000001e-06, "loss": 0.2086, "reward": 1.58984375, "reward_std": 0.6937143355607986, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.734375, "rewards/tag_count_reward": 0.83203125, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 642.6328125, "epoch": 0.0404, "grad_norm": 6.035763409677234, "kl": 1.162109375, "learning_rate": 8.08e-06, "loss": 0.1754, "reward": 1.798828125, "reward_std": 0.4238819405436516, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.900390625, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 703.21875, "epoch": 0.0408, "grad_norm": 5.962513167238038, "kl": 2.865966796875, "learning_rate": 8.16e-06, "loss": 0.2843, "reward": 1.69921875, "reward_std": 0.625610426068306, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.8125, "rewards/tag_count_reward": 0.86328125, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 792.4765625, "epoch": 0.0412, "grad_norm": 283.8654148820664, "kl": 72.5, "learning_rate": 8.24e-06, "loss": 2.9725, "reward": 1.171875, "reward_std": 0.812263086438179, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.4609375, "rewards/tag_count_reward": 0.6640625, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 715.625, "epoch": 0.0416, "grad_norm": 441.3816628068086, "kl": 46.84375, "learning_rate": 8.32e-06, "loss": 2.0409, "reward": 1.458984375, "reward_std": 0.7642089575529099, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.5703125, "rewards/tag_count_reward": 0.763671875, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 583.1640625, "epoch": 0.042, "grad_norm": 14.643690366941419, "kl": 4.109375, "learning_rate": 8.400000000000001e-06, "loss": 0.2244, "reward": 1.96875, "reward_std": 0.4918683245778084, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.8515625, "rewards/tag_count_reward": 0.90625, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 743.640625, "epoch": 0.0424, "grad_norm": 10.897853045315347, "kl": 3.490966796875, "learning_rate": 8.48e-06, "loss": 0.1693, "reward": 1.576171875, "reward_std": 0.5189465880393982, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.703125, "rewards/tag_count_reward": 0.771484375, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 756.8203125, "epoch": 0.0428, "grad_norm": 2.3759158074077606, "kl": 0.61181640625, "learning_rate": 8.560000000000001e-06, "loss": 0.0759, "reward": 1.697265625, "reward_std": 0.5167545825242996, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.7578125, "rewards/tag_count_reward": 0.814453125, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 617.078125, "epoch": 0.0432, "grad_norm": 3.6834181151319623, "kl": 0.3450927734375, "learning_rate": 8.64e-06, "loss": 0.0935, "reward": 1.78515625, "reward_std": 0.5021463930606842, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.90234375, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 610.0, "epoch": 0.0436, "grad_norm": 6.412007091813123, "kl": 0.74267578125, "learning_rate": 8.720000000000001e-06, "loss": 0.1538, "reward": 1.69921875, "reward_std": 0.5158528387546539, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8125, "rewards/tag_count_reward": 0.88671875, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 670.515625, "epoch": 0.044, "grad_norm": 1045.8601411718619, "kl": 30.7025146484375, "learning_rate": 8.8e-06, "loss": 1.6144, "reward": 1.63671875, "reward_std": 0.4866725578904152, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.8203125, "rewards/tag_count_reward": 0.80859375, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 605.109375, "epoch": 0.0444, "grad_norm": 281.22174427770494, "kl": 13.8203125, "learning_rate": 8.880000000000001e-06, "loss": 0.9503, "reward": 1.751953125, "reward_std": 0.5370032861828804, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8515625, "rewards/tag_count_reward": 0.900390625, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 706.3984375, "epoch": 0.0448, "grad_norm": 49.47751763411481, "kl": 7.19921875, "learning_rate": 8.96e-06, "loss": 0.4518, "reward": 1.517578125, "reward_std": 0.7504162490367889, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.6953125, "rewards/tag_count_reward": 0.783203125, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 651.046875, "epoch": 0.0452, "grad_norm": 12.908866269617977, "kl": 2.19140625, "learning_rate": 9.040000000000002e-06, "loss": 0.2263, "reward": 1.615234375, "reward_std": 0.6386354193091393, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.7578125, "rewards/tag_count_reward": 0.857421875, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 662.875, "epoch": 0.0456, "grad_norm": 12.855030496857571, "kl": 1.78857421875, "learning_rate": 9.12e-06, "loss": 0.2263, "reward": 1.76171875, "reward_std": 0.6534905880689621, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.78125, "rewards/tag_count_reward": 0.84765625, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 610.21875, "epoch": 0.046, "grad_norm": 1.759000758408881, "kl": 0.74365234375, "learning_rate": 9.200000000000002e-06, "loss": 0.0219, "reward": 1.91796875, "reward_std": 0.5017628371715546, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.91796875, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 528.4296875, "epoch": 0.0464, "grad_norm": 3.6794775391631833, "kl": 1.7662353515625, "learning_rate": 9.280000000000001e-06, "loss": 0.0678, "reward": 1.8828125, "reward_std": 0.5054407902061939, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.8828125, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 731.9375, "epoch": 0.0468, "grad_norm": 1.7867651463339735, "kl": 0.609375, "learning_rate": 9.360000000000002e-06, "loss": 0.034, "reward": 1.634765625, "reward_std": 0.540661659091711, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8203125, "rewards/tag_count_reward": 0.814453125, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 798.2734375, "epoch": 0.0472, "grad_norm": 6.947957479282318, "kl": 2.22265625, "learning_rate": 9.440000000000001e-06, "loss": 0.1878, "reward": 1.525390625, "reward_std": 0.6614315658807755, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.734375, "rewards/tag_count_reward": 0.791015625, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 616.4296875, "epoch": 0.0476, "grad_norm": 7.116089050835072, "kl": 1.214599609375, "learning_rate": 9.52e-06, "loss": 0.1576, "reward": 1.779296875, "reward_std": 0.5260996222496033, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.8515625, "rewards/tag_count_reward": 0.912109375, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 618.0078125, "epoch": 0.048, "grad_norm": 4.016910715991107, "kl": 1.010498046875, "learning_rate": 9.600000000000001e-06, "loss": 0.1003, "reward": 1.86328125, "reward_std": 0.38497431576251984, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.94140625, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 585.9921875, "epoch": 0.0484, "grad_norm": 2.975934268942709, "kl": 1.16455078125, "learning_rate": 9.68e-06, "loss": 0.1138, "reward": 1.91015625, "reward_std": 0.4655463844537735, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.88671875, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 701.609375, "epoch": 0.0488, "grad_norm": 4.955323814859439, "kl": 2.9873046875, "learning_rate": 9.760000000000001e-06, "loss": 0.2358, "reward": 1.873046875, "reward_std": 0.5975570827722549, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.857421875, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 740.984375, "epoch": 0.0492, "grad_norm": 48.505292839651425, "kl": 6.8046875, "learning_rate": 9.84e-06, "loss": 0.4198, "reward": 1.55859375, "reward_std": 0.7038843184709549, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.765625, "rewards/tag_count_reward": 0.79296875, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 718.0078125, "epoch": 0.0496, "grad_norm": 10.546881899559548, "kl": 1.439453125, "learning_rate": 9.920000000000002e-06, "loss": 0.219, "reward": 1.677734375, "reward_std": 0.5449504777789116, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.826171875, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 648.9296875, "epoch": 0.05, "grad_norm": 36.16629747141123, "kl": 2.8896484375, "learning_rate": 1e-05, "loss": 0.3247, "reward": 1.541015625, "reward_std": 0.6435574218630791, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.7578125, "rewards/tag_count_reward": 0.783203125, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 754.7421875, "epoch": 0.0504, "grad_norm": 5.242839660646181, "kl": 1.337890625, "learning_rate": 1.008e-05, "loss": 0.192, "reward": 1.603515625, "reward_std": 0.6517375111579895, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.765625, "rewards/tag_count_reward": 0.822265625, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 701.015625, "epoch": 0.0508, "grad_norm": 5.247041056662847, "kl": 1.129150390625, "learning_rate": 1.0160000000000001e-05, "loss": 0.1821, "reward": 1.693359375, "reward_std": 0.5904445648193359, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.8125, "rewards/tag_count_reward": 0.873046875, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 606.9921875, "epoch": 0.0512, "grad_norm": 7.614552675332101, "kl": 2.1414794921875, "learning_rate": 1.024e-05, "loss": 0.2656, "reward": 1.716796875, "reward_std": 0.5228581354022026, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.828125, "rewards/tag_count_reward": 0.888671875, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 656.9296875, "epoch": 0.0516, "grad_norm": 11.081251456919793, "kl": 0.95263671875, "learning_rate": 1.0320000000000001e-05, "loss": 0.1215, "reward": 1.94140625, "reward_std": 0.4980618953704834, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.91015625, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 691.671875, "epoch": 0.052, "grad_norm": 5.499485620668624, "kl": 2.013671875, "learning_rate": 1.04e-05, "loss": 0.2745, "reward": 1.833984375, "reward_std": 0.5770903006196022, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.8359375, "rewards/tag_count_reward": 0.873046875, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 683.7578125, "epoch": 0.0524, "grad_norm": 83.2017055182771, "kl": 9.65625, "learning_rate": 1.0480000000000001e-05, "loss": 0.5643, "reward": 1.80859375, "reward_std": 0.6098896637558937, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.8046875, "rewards/tag_count_reward": 0.86328125, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 717.03125, "epoch": 0.0528, "grad_norm": 197101.86176513802, "kl": 3717.7021484375, "learning_rate": 1.056e-05, "loss": 161.7858, "reward": 1.380859375, "reward_std": 0.769332155585289, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.546875, "rewards/tag_count_reward": 0.677734375, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 772.1015625, "epoch": 0.0532, "grad_norm": 116.5953030357046, "kl": 19.46875, "learning_rate": 1.0640000000000001e-05, "loss": 1.0604, "reward": 1.326171875, "reward_std": 0.8686842024326324, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.5625, "rewards/tag_count_reward": 0.701171875, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 731.2734375, "epoch": 0.0536, "grad_norm": 71.07373465740717, "kl": 12.3203125, "learning_rate": 1.072e-05, "loss": 0.7071, "reward": 1.423828125, "reward_std": 0.7525497823953629, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.6484375, "rewards/tag_count_reward": 0.751953125, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 611.34375, "epoch": 0.054, "grad_norm": 5.873722606864585, "kl": 1.3681640625, "learning_rate": 1.0800000000000002e-05, "loss": 0.1598, "reward": 1.626953125, "reward_std": 0.7181145399808884, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.7578125, "rewards/tag_count_reward": 0.814453125, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 671.6015625, "epoch": 0.0544, "grad_norm": 5.608384596597416, "kl": 0.5908203125, "learning_rate": 1.0880000000000001e-05, "loss": 0.1135, "reward": 1.7109375, "reward_std": 0.6418382078409195, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.828125, "rewards/tag_count_reward": 0.8671875, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 794.71875, "epoch": 0.0548, "grad_norm": 2.790111536466758, "kl": 1.1513671875, "learning_rate": 1.0960000000000002e-05, "loss": 0.093, "reward": 1.513671875, "reward_std": 0.8384787142276764, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.65625, "rewards/tag_count_reward": 0.755859375, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 676.3046875, "epoch": 0.0552, "grad_norm": 8.916661272509268, "kl": 2.2587890625, "learning_rate": 1.1040000000000001e-05, "loss": 0.1996, "reward": 1.53515625, "reward_std": 0.7422526478767395, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.734375, "rewards/tag_count_reward": 0.80078125, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 699.8125, "epoch": 0.0556, "grad_norm": 9.087969191294773, "kl": 1.853515625, "learning_rate": 1.1120000000000002e-05, "loss": 0.1449, "reward": 1.451171875, "reward_std": 0.6166361421346664, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.671875, "rewards/tag_count_reward": 0.755859375, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 620.734375, "epoch": 0.056, "grad_norm": 2.6591438702104875, "kl": 0.878662109375, "learning_rate": 1.1200000000000001e-05, "loss": 0.1511, "reward": 1.767578125, "reward_std": 0.6695654392242432, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.7890625, "rewards/tag_count_reward": 0.853515625, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 728.2734375, "epoch": 0.0564, "grad_norm": 2.3151070850892785, "kl": 2.62109375, "learning_rate": 1.128e-05, "loss": 0.303, "reward": 1.14453125, "reward_std": 0.766977995634079, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.5, "rewards/tag_count_reward": 0.63671875, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 757.9609375, "epoch": 0.0568, "grad_norm": 192.66471950542828, "kl": 11.41748046875, "learning_rate": 1.136e-05, "loss": 0.6321, "reward": 1.19921875, "reward_std": 0.6529610455036163, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.4453125, "rewards/tag_count_reward": 0.61328125, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 762.234375, "epoch": 0.0572, "grad_norm": 12.866972675073473, "kl": 2.994140625, "learning_rate": 1.144e-05, "loss": 0.3241, "reward": 1.10546875, "reward_std": 0.6906668692827225, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.4140625, "rewards/tag_count_reward": 0.56640625, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 815.875, "epoch": 0.0576, "grad_norm": 4.059920456659772, "kl": 2.51953125, "learning_rate": 1.152e-05, "loss": 0.3024, "reward": 0.82421875, "reward_std": 0.6305588409304619, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.2421875, "rewards/tag_count_reward": 0.45703125, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 904.09375, "epoch": 0.058, "grad_norm": 3.1005614632330345, "kl": 2.2578125, "learning_rate": 1.16e-05, "loss": 0.1629, "reward": 0.39453125, "reward_std": 0.3172791190445423, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0703125, "rewards/tag_count_reward": 0.32421875, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 764.8046875, "epoch": 0.0584, "grad_norm": 2.099830534790427, "kl": 1.14453125, "learning_rate": 1.168e-05, "loss": 0.0835, "reward": 0.921875, "reward_std": 0.7855729162693024, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.375, "rewards/tag_count_reward": 0.546875, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 654.1015625, "epoch": 0.0588, "grad_norm": 2.5838150257442414, "kl": 0.6376953125, "learning_rate": 1.1760000000000001e-05, "loss": 0.1784, "reward": 1.279296875, "reward_std": 0.7751196026802063, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.546875, "rewards/tag_count_reward": 0.677734375, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 710.1328125, "epoch": 0.0592, "grad_norm": 2.7258108197382773, "kl": 0.65234375, "learning_rate": 1.184e-05, "loss": 0.1247, "reward": 1.30859375, "reward_std": 0.7504727691411972, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.515625, "rewards/tag_count_reward": 0.66796875, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 763.1328125, "epoch": 0.0596, "grad_norm": 1.593855358832337, "kl": 0.89892578125, "learning_rate": 1.1920000000000001e-05, "loss": 0.2253, "reward": 1.291015625, "reward_std": 0.7713954299688339, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.5546875, "rewards/tag_count_reward": 0.697265625, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 874.2109375, "epoch": 0.06, "grad_norm": 2.827618206636587, "kl": 1.595703125, "learning_rate": 1.2e-05, "loss": 0.1834, "reward": 1.052734375, "reward_std": 0.756165474653244, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.359375, "rewards/tag_count_reward": 0.560546875, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 724.765625, "epoch": 0.0604, "grad_norm": 3.353243999450126, "kl": 1.072265625, "learning_rate": 1.2080000000000001e-05, "loss": 0.314, "reward": 1.42578125, "reward_std": 0.8308226764202118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.6015625, "rewards/tag_count_reward": 0.69921875, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 768.7578125, "epoch": 0.0608, "grad_norm": 3.7766427815072916, "kl": 0.72412109375, "learning_rate": 1.216e-05, "loss": 0.1319, "reward": 1.552734375, "reward_std": 0.6646496057510376, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.7265625, "rewards/tag_count_reward": 0.810546875, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 651.65625, "epoch": 0.0612, "grad_norm": 25.58057463908196, "kl": 0.47412109375, "learning_rate": 1.2240000000000001e-05, "loss": 0.1451, "reward": 2.07421875, "reward_std": 0.49661964923143387, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.91796875, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 601.5, "epoch": 0.0616, "grad_norm": 2.5807391265263484, "kl": 0.38818359375, "learning_rate": 1.232e-05, "loss": 0.1086, "reward": 1.91796875, "reward_std": 0.34925565868616104, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.94921875, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 583.109375, "epoch": 0.062, "grad_norm": 0.5261566216894528, "kl": 0.1376953125, "learning_rate": 1.2400000000000002e-05, "loss": 0.0212, "reward": 1.892578125, "reward_std": 0.30967047810554504, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.931640625, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 744.15625, "epoch": 0.0624, "grad_norm": 4.304069064016134, "kl": 1.36474609375, "learning_rate": 1.248e-05, "loss": 0.1498, "reward": 1.732421875, "reward_std": 0.6138521805405617, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.71875, "rewards/tag_count_reward": 0.802734375, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 629.2109375, "epoch": 0.0628, "grad_norm": 2.2885239748819592, "kl": 0.3594970703125, "learning_rate": 1.2560000000000002e-05, "loss": 0.0433, "reward": 1.91796875, "reward_std": 0.43369776755571365, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.92578125, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 819.265625, "epoch": 0.0632, "grad_norm": 80.63353344781103, "kl": 9.3212890625, "learning_rate": 1.2640000000000001e-05, "loss": 0.503, "reward": 1.5078125, "reward_std": 0.6716418266296387, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.703125, "rewards/tag_count_reward": 0.78125, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 794.25, "epoch": 0.0636, "grad_norm": 5.312289253767943, "kl": 0.929443359375, "learning_rate": 1.2720000000000002e-05, "loss": 0.1185, "reward": 1.828125, "reward_std": 0.647901713848114, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.7578125, "rewards/tag_count_reward": 0.8203125, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 687.0703125, "epoch": 0.064, "grad_norm": 1.9123853168239182, "kl": 0.1947021484375, "learning_rate": 1.2800000000000001e-05, "loss": 0.0332, "reward": 1.865234375, "reward_std": 0.4678259342908859, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.904296875, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 717.515625, "epoch": 0.0644, "grad_norm": 1.5844708650883155, "kl": 0.3385009765625, "learning_rate": 1.2880000000000002e-05, "loss": 0.0573, "reward": 1.7421875, "reward_std": 0.4795515537261963, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.828125, "rewards/tag_count_reward": 0.8984375, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 665.015625, "epoch": 0.0648, "grad_norm": 11.357418676128582, "kl": 0.52392578125, "learning_rate": 1.2960000000000001e-05, "loss": 0.1401, "reward": 2.07421875, "reward_std": 0.6185346245765686, "rewards/accuracy_reward": 0.3671875, "rewards/format_reward": 0.828125, "rewards/tag_count_reward": 0.87890625, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 726.4921875, "epoch": 0.0652, "grad_norm": 86.93863891220593, "kl": 9.14111328125, "learning_rate": 1.3040000000000002e-05, "loss": 0.5361, "reward": 1.689453125, "reward_std": 0.5941673219203949, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.7890625, "rewards/tag_count_reward": 0.869140625, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 799.53125, "epoch": 0.0656, "grad_norm": 168.67990192927732, "kl": 17.918701171875, "learning_rate": 1.3120000000000001e-05, "loss": 0.922, "reward": 1.5703125, "reward_std": 0.7391193509101868, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.640625, "rewards/tag_count_reward": 0.7734375, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 750.2578125, "epoch": 0.066, "grad_norm": 3.342686476254917, "kl": 0.72900390625, "learning_rate": 1.3200000000000002e-05, "loss": 0.0917, "reward": 1.453125, "reward_std": 0.6508469060063362, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.65625, "rewards/tag_count_reward": 0.796875, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 681.46875, "epoch": 0.0664, "grad_norm": 8.255087647440291, "kl": 0.90673828125, "learning_rate": 1.3280000000000002e-05, "loss": 0.2014, "reward": 1.7109375, "reward_std": 0.7432717680931091, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.765625, "rewards/tag_count_reward": 0.84375, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 697.8515625, "epoch": 0.0668, "grad_norm": 6.032960363193726, "kl": 0.44482421875, "learning_rate": 1.3360000000000003e-05, "loss": 0.0902, "reward": 2.064453125, "reward_std": 0.6085589230060577, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.916015625, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 746.3203125, "epoch": 0.0672, "grad_norm": 15.493885107983653, "kl": 2.888671875, "learning_rate": 1.3440000000000002e-05, "loss": 0.2591, "reward": 1.525390625, "reward_std": 0.6363618820905685, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.671875, "rewards/tag_count_reward": 0.767578125, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 686.7578125, "epoch": 0.0676, "grad_norm": 11313.13323194109, "kl": 274.3603515625, "learning_rate": 1.3520000000000003e-05, "loss": 16.4375, "reward": 1.8203125, "reward_std": 0.6521976217627525, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.8125, "rewards/tag_count_reward": 0.859375, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 735.328125, "epoch": 0.068, "grad_norm": 1.978535684711438, "kl": 0.9949951171875, "learning_rate": 1.3600000000000002e-05, "loss": 0.0903, "reward": 1.708984375, "reward_std": 0.577680915594101, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.7578125, "rewards/tag_count_reward": 0.826171875, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 684.7421875, "epoch": 0.0684, "grad_norm": 6.202479768658621, "kl": 1.0341796875, "learning_rate": 1.3680000000000003e-05, "loss": 0.2281, "reward": 1.89453125, "reward_std": 0.7825465202331543, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.7734375, "rewards/tag_count_reward": 0.83984375, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 662.4765625, "epoch": 0.0688, "grad_norm": 2.540159267102039, "kl": 0.9124755859375, "learning_rate": 1.376e-05, "loss": 0.1007, "reward": 1.970703125, "reward_std": 0.5036688968539238, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.8359375, "rewards/tag_count_reward": 0.884765625, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 740.75, "epoch": 0.0692, "grad_norm": 33.16007232335001, "kl": 6.646728515625, "learning_rate": 1.384e-05, "loss": 0.4556, "reward": 1.560546875, "reward_std": 0.6917675957083702, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.6796875, "rewards/tag_count_reward": 0.779296875, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 707.0703125, "epoch": 0.0696, "grad_norm": 7.731171907173846, "kl": 1.2822265625, "learning_rate": 1.392e-05, "loss": 0.1776, "reward": 1.865234375, "reward_std": 0.5722436010837555, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.8515625, "rewards/tag_count_reward": 0.888671875, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 693.609375, "epoch": 0.07, "grad_norm": 8.656907569782254, "kl": 3.911376953125, "learning_rate": 1.4e-05, "loss": 0.3254, "reward": 2.099609375, "reward_std": 0.6692389845848083, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.896484375, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 703.640625, "epoch": 0.0704, "grad_norm": 3.0394827149791896, "kl": 0.8709716796875, "learning_rate": 1.408e-05, "loss": 0.0953, "reward": 1.92578125, "reward_std": 0.5619572699069977, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.89453125, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 653.5546875, "epoch": 0.0708, "grad_norm": 227252.837631809, "kl": 4480.9324951171875, "learning_rate": 1.416e-05, "loss": 270.3076, "reward": 1.955078125, "reward_std": 0.413997121155262, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.939453125, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 735.8984375, "epoch": 0.0712, "grad_norm": 5.28652673497516, "kl": 2.7998046875, "learning_rate": 1.4240000000000001e-05, "loss": 0.2426, "reward": 1.78515625, "reward_std": 0.6022721156477928, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.8125, "rewards/tag_count_reward": 0.86328125, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 628.7734375, "epoch": 0.0716, "grad_norm": 5.860226677890728, "kl": 2.3671875, "learning_rate": 1.432e-05, "loss": 0.2991, "reward": 2.080078125, "reward_std": 0.5285796374082565, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.923828125, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 697.84375, "epoch": 0.072, "grad_norm": 64.61645547289511, "kl": 10.0615234375, "learning_rate": 1.4400000000000001e-05, "loss": 0.7781, "reward": 1.939453125, "reward_std": 0.6099516376852989, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.8359375, "rewards/tag_count_reward": 0.876953125, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 700.2109375, "epoch": 0.0724, "grad_norm": 2.707640806276617, "kl": 0.4085693359375, "learning_rate": 1.448e-05, "loss": 0.1249, "reward": 2.056640625, "reward_std": 0.4133058860898018, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.955078125, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 691.7421875, "epoch": 0.0728, "grad_norm": 1.7194391261804929, "kl": 1.3177490234375, "learning_rate": 1.4560000000000001e-05, "loss": 0.1885, "reward": 1.8359375, "reward_std": 0.5127608776092529, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.9140625, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 741.78125, "epoch": 0.0732, "grad_norm": 176.74893524348997, "kl": 28.328125, "learning_rate": 1.464e-05, "loss": 1.7331, "reward": 1.9453125, "reward_std": 0.3989417627453804, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.9453125, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 686.1171875, "epoch": 0.0736, "grad_norm": 8.614737502278674, "kl": 3.2635498046875, "learning_rate": 1.4720000000000001e-05, "loss": 0.2234, "reward": 1.978515625, "reward_std": 0.453658327460289, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.947265625, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 699.0078125, "epoch": 0.074, "grad_norm": 34.60398508686282, "kl": 1.9854736328125, "learning_rate": 1.48e-05, "loss": 0.1346, "reward": 2.001953125, "reward_std": 0.3636244460940361, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.955078125, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 654.7265625, "epoch": 0.0744, "grad_norm": 3.4709049032021504, "kl": 1.202392578125, "learning_rate": 1.4880000000000002e-05, "loss": 0.1926, "reward": 1.8984375, "reward_std": 0.5502865761518478, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.921875, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 722.953125, "epoch": 0.0748, "grad_norm": 8.356630924068327, "kl": 2.099609375, "learning_rate": 1.496e-05, "loss": 0.1306, "reward": 2.037109375, "reward_std": 0.2744537219405174, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.935546875, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 637.90625, "epoch": 0.0752, "grad_norm": 4.464061897059322, "kl": 2.37890625, "learning_rate": 1.5040000000000002e-05, "loss": 0.3186, "reward": 1.869140625, "reward_std": 0.6315409988164902, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.908203125, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 689.921875, "epoch": 0.0756, "grad_norm": 8.817356491604283, "kl": 2.609375, "learning_rate": 1.5120000000000001e-05, "loss": 0.3183, "reward": 1.802734375, "reward_std": 0.5654085278511047, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.8125, "rewards/tag_count_reward": 0.857421875, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 603.2265625, "epoch": 0.076, "grad_norm": 27.677394323515966, "kl": 10.6781005859375, "learning_rate": 1.5200000000000002e-05, "loss": 0.6891, "reward": 2.263671875, "reward_std": 0.4801713675260544, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.896484375, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 656.0703125, "epoch": 0.0764, "grad_norm": 7.587383092029749, "kl": 5.189453125, "learning_rate": 1.5280000000000003e-05, "loss": 0.5188, "reward": 1.93359375, "reward_std": 0.6799550801515579, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.8515625, "rewards/tag_count_reward": 0.88671875, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 708.4765625, "epoch": 0.0768, "grad_norm": 8.112247791731301, "kl": 0.631103515625, "learning_rate": 1.5360000000000002e-05, "loss": 0.1915, "reward": 1.875, "reward_std": 0.5841406285762787, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.890625, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 743.3125, "epoch": 0.0772, "grad_norm": 1315.4993286671145, "kl": 21.07147216796875, "learning_rate": 1.544e-05, "loss": 1.3866, "reward": 1.7734375, "reward_std": 0.7037943303585052, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.8046875, "rewards/tag_count_reward": 0.8515625, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 732.1328125, "epoch": 0.0776, "grad_norm": 5.873456038194379, "kl": 1.4521484375, "learning_rate": 1.552e-05, "loss": 0.2323, "reward": 1.4296875, "reward_std": 0.5839791223406792, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.671875, "rewards/tag_count_reward": 0.75, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 783.3203125, "epoch": 0.078, "grad_norm": 4.471009408880978, "kl": 3.361328125, "learning_rate": 1.5600000000000003e-05, "loss": 0.317, "reward": 1.751953125, "reward_std": 0.731633871793747, "rewards/accuracy_reward": 0.3046875, "rewards/format_reward": 0.6875, "rewards/tag_count_reward": 0.759765625, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 701.21875, "epoch": 0.0784, "grad_norm": 5.529640354254647, "kl": 2.890625, "learning_rate": 1.5680000000000002e-05, "loss": 0.345, "reward": 1.662109375, "reward_std": 0.7221723720431328, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.765625, "rewards/tag_count_reward": 0.826171875, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 775.21875, "epoch": 0.0788, "grad_norm": 7.6653699515981835, "kl": 5.95703125, "learning_rate": 1.576e-05, "loss": 0.4792, "reward": 1.4375, "reward_std": 0.6279073059558868, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.59375, "rewards/tag_count_reward": 0.6953125, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 659.3125, "epoch": 0.0792, "grad_norm": 4.5597802762076265, "kl": 1.0234375, "learning_rate": 1.584e-05, "loss": 0.2381, "reward": 1.8203125, "reward_std": 0.48210127651691437, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.921875, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 686.546875, "epoch": 0.0796, "grad_norm": 3.6450828714633525, "kl": 1.468505859375, "learning_rate": 1.5920000000000003e-05, "loss": 0.1381, "reward": 2.01171875, "reward_std": 0.3937654122710228, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.95703125, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 639.0703125, "epoch": 0.08, "grad_norm": 230.51991170004055, "kl": 14.7940673828125, "learning_rate": 1.6000000000000003e-05, "loss": 1.5488, "reward": 1.875, "reward_std": 0.40892522037029266, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.8828125, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 695.734375, "epoch": 0.0804, "grad_norm": 86.03108162357051, "kl": 8.115234375, "learning_rate": 1.6080000000000002e-05, "loss": 0.6955, "reward": 2.029296875, "reward_std": 0.2618904709815979, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 557.609375, "epoch": 0.0808, "grad_norm": 1.316984923867534, "kl": 0.0892333984375, "learning_rate": 1.616e-05, "loss": 0.0208, "reward": 2.38671875, "reward_std": 0.2964007183909416, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.99609375, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 644.4296875, "epoch": 0.0812, "grad_norm": 7.146235020481302, "kl": 3.105224609375, "learning_rate": 1.6240000000000004e-05, "loss": 0.2545, "reward": 1.947265625, "reward_std": 0.3470195010304451, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.955078125, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 707.8359375, "epoch": 0.0816, "grad_norm": 0.33371698318366066, "kl": 0.09185791015625, "learning_rate": 1.632e-05, "loss": 0.0215, "reward": 1.990234375, "reward_std": 0.17806018888950348, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 508.0, "epoch": 0.082, "grad_norm": 2.005684840355363, "kl": 0.547607421875, "learning_rate": 1.64e-05, "loss": 0.1165, "reward": 2.1328125, "reward_std": 0.2659989222884178, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.9609375, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 781.765625, "epoch": 0.0824, "grad_norm": 1.630980560792172, "kl": 0.248046875, "learning_rate": 1.648e-05, "loss": 0.08, "reward": 1.73828125, "reward_std": 0.5312443450093269, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.796875, "rewards/tag_count_reward": 0.85546875, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 686.2109375, "epoch": 0.0828, "grad_norm": 2.430219777335061, "kl": 0.890380859375, "learning_rate": 1.656e-05, "loss": 0.0873, "reward": 1.869140625, "reward_std": 0.373492494225502, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.939453125, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 712.1796875, "epoch": 0.0832, "grad_norm": 32.200003082113, "kl": 5.945556640625, "learning_rate": 1.664e-05, "loss": 0.3585, "reward": 2.080078125, "reward_std": 0.342342060059309, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.955078125, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 688.390625, "epoch": 0.0836, "grad_norm": 55.913576292414476, "kl": 10.3203125, "learning_rate": 1.672e-05, "loss": 0.6829, "reward": 1.61328125, "reward_std": 0.4491588920354843, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.6953125, "rewards/tag_count_reward": 0.81640625, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 676.953125, "epoch": 0.084, "grad_norm": 1.202751548147367, "kl": 0.565185546875, "learning_rate": 1.6800000000000002e-05, "loss": 0.0921, "reward": 1.66015625, "reward_std": 0.4314382076263428, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.7421875, "rewards/tag_count_reward": 0.91796875, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 657.0859375, "epoch": 0.0844, "grad_norm": 3.5637705987163604, "kl": 0.51806640625, "learning_rate": 1.688e-05, "loss": 0.1928, "reward": 1.859375, "reward_std": 0.5685551166534424, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.8203125, "rewards/tag_count_reward": 0.875, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 641.8984375, "epoch": 0.0848, "grad_norm": 0.3385863353669577, "kl": 0.114990234375, "learning_rate": 1.696e-05, "loss": 0.0312, "reward": 1.96875, "reward_std": 0.2890051081776619, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.9765625, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 678.421875, "epoch": 0.0852, "grad_norm": 0.34903628109966633, "kl": 0.1435546875, "learning_rate": 1.704e-05, "loss": 0.0127, "reward": 1.97265625, "reward_std": 0.2936744689941406, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 652.859375, "epoch": 0.0856, "grad_norm": 1.554004270395369, "kl": 0.64453125, "learning_rate": 1.7120000000000002e-05, "loss": 0.1087, "reward": 1.97265625, "reward_std": 0.41128628700971603, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.94140625, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 615.59375, "epoch": 0.086, "grad_norm": 0.3267895768624167, "kl": 0.1041259765625, "learning_rate": 1.72e-05, "loss": 0.0136, "reward": 1.96875, "reward_std": 0.2213020622730255, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 669.8671875, "epoch": 0.0864, "grad_norm": 8.355602101115542, "kl": 1.873291015625, "learning_rate": 1.728e-05, "loss": 0.115, "reward": 1.912109375, "reward_std": 0.51043251901865, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.927734375, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 770.3828125, "epoch": 0.0868, "grad_norm": 0.5386786170356803, "kl": 0.183349609375, "learning_rate": 1.736e-05, "loss": 0.0298, "reward": 1.71875, "reward_std": 0.44237203150987625, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.8125, "rewards/tag_count_reward": 0.8671875, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 777.375, "epoch": 0.0872, "grad_norm": 0.3132561919117228, "kl": 0.0797119140625, "learning_rate": 1.7440000000000002e-05, "loss": 0.0378, "reward": 1.91015625, "reward_std": 0.5603981241583824, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.89453125, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 610.3828125, "epoch": 0.0876, "grad_norm": 2.695507788311956, "kl": 0.798095703125, "learning_rate": 1.752e-05, "loss": 0.0685, "reward": 1.9453125, "reward_std": 0.23985500633716583, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.9765625, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 719.0390625, "epoch": 0.088, "grad_norm": 0.3031073784531277, "kl": 0.0863037109375, "learning_rate": 1.76e-05, "loss": 0.0453, "reward": 1.947265625, "reward_std": 0.2943003475666046, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.970703125, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 738.3125, "epoch": 0.0884, "grad_norm": 0.423475391085179, "kl": 0.12353515625, "learning_rate": 1.768e-05, "loss": 0.0377, "reward": 1.89453125, "reward_std": 0.40020492672920227, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.88671875, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 657.5546875, "epoch": 0.0888, "grad_norm": 3.116009883442755, "kl": 0.933837890625, "learning_rate": 1.7760000000000003e-05, "loss": 0.0988, "reward": 1.99609375, "reward_std": 0.25167861208319664, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.97265625, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 739.2578125, "epoch": 0.0892, "grad_norm": 1.2173925313700404, "kl": 0.2686767578125, "learning_rate": 1.7840000000000002e-05, "loss": 0.0442, "reward": 2.0078125, "reward_std": 0.26654772460460663, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 772.9453125, "epoch": 0.0896, "grad_norm": 0.32390190788962087, "kl": 0.0916748046875, "learning_rate": 1.792e-05, "loss": 0.0208, "reward": 1.892578125, "reward_std": 0.3200267255306244, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.947265625, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 604.984375, "epoch": 0.09, "grad_norm": 7.805773291109405, "kl": 1.7957763671875, "learning_rate": 1.8e-05, "loss": 0.1564, "reward": 1.998046875, "reward_std": 0.3296370692551136, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 669.1015625, "epoch": 0.0904, "grad_norm": 3.4618675204437177, "kl": 1.41845703125, "learning_rate": 1.8080000000000003e-05, "loss": 0.1334, "reward": 1.921875, "reward_std": 0.2909228354692459, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.953125, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 650.03125, "epoch": 0.0908, "grad_norm": 0.5891380957825096, "kl": 0.2684326171875, "learning_rate": 1.8160000000000002e-05, "loss": -0.0125, "reward": 2.111328125, "reward_std": 0.13118819519877434, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 634.25, "epoch": 0.0912, "grad_norm": 1.5559313658132976, "kl": 0.2474365234375, "learning_rate": 1.824e-05, "loss": 0.0702, "reward": 1.849609375, "reward_std": 0.1700693480670452, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.943359375, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 716.03125, "epoch": 0.0916, "grad_norm": 3.0964115056411554, "kl": 0.4217529296875, "learning_rate": 1.832e-05, "loss": 0.0917, "reward": 1.9375, "reward_std": 0.30835093557834625, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.9609375, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 702.0546875, "epoch": 0.092, "grad_norm": 1.4681309143245176, "kl": 1.1268310546875, "learning_rate": 1.8400000000000003e-05, "loss": 0.0786, "reward": 1.9609375, "reward_std": 0.26145318150520325, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.9609375, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 625.3984375, "epoch": 0.0924, "grad_norm": 2.1661021229509223, "kl": 0.5684814453125, "learning_rate": 1.8480000000000003e-05, "loss": 0.0736, "reward": 1.98046875, "reward_std": 0.140625, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 721.75, "epoch": 0.0928, "grad_norm": 57.59806635929127, "kl": 9.978515625, "learning_rate": 1.8560000000000002e-05, "loss": 0.5941, "reward": 1.87890625, "reward_std": 0.317934013903141, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94921875, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 629.203125, "epoch": 0.0932, "grad_norm": 0.23887160010232372, "kl": 0.103515625, "learning_rate": 1.864e-05, "loss": 0.0186, "reward": 2.150390625, "reward_std": 0.14777223765850067, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 744.6171875, "epoch": 0.0936, "grad_norm": 3.1364355022739225, "kl": 0.828369140625, "learning_rate": 1.8720000000000004e-05, "loss": 0.0677, "reward": 1.904296875, "reward_std": 0.2649005129933357, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.951171875, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 683.2734375, "epoch": 0.094, "grad_norm": 1.6967636253264708, "kl": 0.6729736328125, "learning_rate": 1.88e-05, "loss": 0.0738, "reward": 1.765625, "reward_std": 0.34701254218816757, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.8828125, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 706.859375, "epoch": 0.0944, "grad_norm": 1.9679938691210672, "kl": 0.265869140625, "learning_rate": 1.8880000000000002e-05, "loss": 0.043, "reward": 1.96875, "reward_std": 0.168813094496727, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 687.5234375, "epoch": 0.0948, "grad_norm": 0.12800532297569778, "kl": 0.0806884765625, "learning_rate": 1.896e-05, "loss": 0.0114, "reward": 2.015625, "reward_std": 0.042695626616477966, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 625.421875, "epoch": 0.0952, "grad_norm": 0.3719334112721026, "kl": 0.0965576171875, "learning_rate": 1.904e-05, "loss": -0.0014, "reward": 2.240234375, "reward_std": 0.3020515665411949, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 747.3515625, "epoch": 0.0956, "grad_norm": 0.3327807701672144, "kl": 0.0870361328125, "learning_rate": 1.912e-05, "loss": 0.021, "reward": 1.919921875, "reward_std": 0.2885564714670181, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.943359375, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 638.765625, "epoch": 0.096, "grad_norm": 0.3348276657410682, "kl": 0.099609375, "learning_rate": 1.9200000000000003e-05, "loss": 0.0, "reward": 2.197265625, "reward_std": 0.23650237917900085, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 755.375, "epoch": 0.0964, "grad_norm": 0.2845133815432404, "kl": 0.0771484375, "learning_rate": 1.9280000000000002e-05, "loss": 0.0556, "reward": 2.078125, "reward_std": 0.40792985260486603, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.953125, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 800.921875, "epoch": 0.0968, "grad_norm": 4.2983961728266085, "kl": 0.8828125, "learning_rate": 1.936e-05, "loss": 0.0767, "reward": 1.845703125, "reward_std": 0.4882144406437874, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.78125, "rewards/tag_count_reward": 0.845703125, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 684.2734375, "epoch": 0.0972, "grad_norm": 0.138613740533147, "kl": 0.0870361328125, "learning_rate": 1.944e-05, "loss": 0.0054, "reward": 2.0234375, "reward_std": 0.050389111042022705, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 684.546875, "epoch": 0.0976, "grad_norm": 0.1769205821846829, "kl": 0.087158203125, "learning_rate": 1.9520000000000003e-05, "loss": 0.0253, "reward": 1.962890625, "reward_std": 0.11377985030412674, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 692.2890625, "epoch": 0.098, "grad_norm": 0.9125824813863455, "kl": 0.13720703125, "learning_rate": 1.9600000000000002e-05, "loss": 0.0428, "reward": 2.0390625, "reward_std": 0.21875, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 714.8671875, "epoch": 0.0984, "grad_norm": 0.5983302748983784, "kl": 0.27978515625, "learning_rate": 1.968e-05, "loss": 0.0404, "reward": 1.9921875, "reward_std": 0.19148863852024078, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 746.9296875, "epoch": 0.0988, "grad_norm": 0.2741804508332127, "kl": 0.082763671875, "learning_rate": 1.976e-05, "loss": -0.0029, "reward": 2.099609375, "reward_std": 0.20939674973487854, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 672.9140625, "epoch": 0.0992, "grad_norm": 0.2215769797793813, "kl": 0.101318359375, "learning_rate": 1.9840000000000003e-05, "loss": 0.0148, "reward": 2.09765625, "reward_std": 0.08461953699588776, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.99609375, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 744.4453125, "epoch": 0.0996, "grad_norm": 0.46578066122837, "kl": 0.1822509765625, "learning_rate": 1.9920000000000002e-05, "loss": 0.0377, "reward": 2.064453125, "reward_std": 0.3240993767976761, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 720.4453125, "epoch": 0.1, "grad_norm": 0.2039466654430354, "kl": 0.1572265625, "learning_rate": 2e-05, "loss": 0.0273, "reward": 1.9921875, "reward_std": 0.1704520508646965, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.9453125, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 695.96875, "epoch": 0.1004, "grad_norm": 0.2526498505830934, "kl": 0.090576171875, "learning_rate": 1.9999990252244153e-05, "loss": 0.0142, "reward": 2.107421875, "reward_std": 0.24188994616270065, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 680.5546875, "epoch": 0.1008, "grad_norm": 0.28596323442536753, "kl": 0.0947265625, "learning_rate": 1.9999961008995607e-05, "loss": 0.0116, "reward": 1.978515625, "reward_std": 0.11377985030412674, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.986328125, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 645.6328125, "epoch": 0.1012, "grad_norm": 0.7262853284027406, "kl": 0.09912109375, "learning_rate": 1.9999912270311376e-05, "loss": 0.0077, "reward": 2.173828125, "reward_std": 0.14998093992471695, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 665.4765625, "epoch": 0.1016, "grad_norm": 0.3456920898242669, "kl": 0.0799560546875, "learning_rate": 1.9999844036286483e-05, "loss": 0.0081, "reward": 2.078125, "reward_std": 0.30348586291074753, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 667.1796875, "epoch": 0.102, "grad_norm": 1.053414196918549, "kl": 0.1943359375, "learning_rate": 1.9999756307053947e-05, "loss": 0.0711, "reward": 2.013671875, "reward_std": 0.2698054909706116, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 708.8203125, "epoch": 0.1024, "grad_norm": 0.8586136289696347, "kl": 0.142333984375, "learning_rate": 1.9999649082784807e-05, "loss": 0.1081, "reward": 1.998046875, "reward_std": 0.362869068980217, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 789.265625, "epoch": 0.1028, "grad_norm": 1.6122658191433115, "kl": 0.4384765625, "learning_rate": 1.99995223636881e-05, "loss": 0.1951, "reward": 1.71875, "reward_std": 0.8110913038253784, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.703125, "rewards/tag_count_reward": 0.78125, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 819.3359375, "epoch": 0.1032, "grad_norm": 1.7558832576327443, "kl": 0.53173828125, "learning_rate": 1.9999376150010868e-05, "loss": 0.1692, "reward": 1.53515625, "reward_std": 0.7376429736614227, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.7265625, "rewards/tag_count_reward": 0.80859375, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 835.390625, "epoch": 0.1036, "grad_norm": 3.4256923683475105, "kl": 0.7646484375, "learning_rate": 1.9999210442038164e-05, "loss": 0.2011, "reward": 1.220703125, "reward_std": 0.6945144012570381, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.5078125, "rewards/tag_count_reward": 0.634765625, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 770.109375, "epoch": 0.104, "grad_norm": 2.739333228127118, "kl": 0.54736328125, "learning_rate": 1.9999025240093045e-05, "loss": 0.1959, "reward": 1.619140625, "reward_std": 0.688667468726635, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.71875, "rewards/tag_count_reward": 0.791015625, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 747.03125, "epoch": 0.1044, "grad_norm": 3.24929096852871, "kl": 0.47119140625, "learning_rate": 1.999882054453657e-05, "loss": 0.1938, "reward": 1.71484375, "reward_std": 0.8708623200654984, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.71875, "rewards/tag_count_reward": 0.80078125, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 775.6015625, "epoch": 0.1048, "grad_norm": 0.9830382797138626, "kl": 0.27197265625, "learning_rate": 1.9998596355767805e-05, "loss": 0.1288, "reward": 1.61328125, "reward_std": 0.7062838971614838, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.734375, "rewards/tag_count_reward": 0.80859375, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 693.5078125, "epoch": 0.1052, "grad_norm": 0.8713239140624717, "kl": 0.1884765625, "learning_rate": 1.9998352674223816e-05, "loss": 0.1861, "reward": 1.810546875, "reward_std": 0.5237614661455154, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.904296875, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 702.6484375, "epoch": 0.1056, "grad_norm": 1.5212095690382255, "kl": 0.353515625, "learning_rate": 1.999808950037968e-05, "loss": 0.163, "reward": 1.708984375, "reward_std": 0.510637603700161, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.8203125, "rewards/tag_count_reward": 0.865234375, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 770.7265625, "epoch": 0.106, "grad_norm": 3.569732287797024, "kl": 0.595947265625, "learning_rate": 1.9997806834748455e-05, "loss": 0.145, "reward": 1.52734375, "reward_std": 0.6711234524846077, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.7265625, "rewards/tag_count_reward": 0.79296875, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 740.7109375, "epoch": 0.1064, "grad_norm": 1.4225138430617863, "kl": 0.2877197265625, "learning_rate": 1.9997504677881224e-05, "loss": 0.146, "reward": 1.75390625, "reward_std": 0.4501335918903351, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.89453125, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 742.75, "epoch": 0.1068, "grad_norm": 0.5868894212429334, "kl": 0.15869140625, "learning_rate": 1.999718303036705e-05, "loss": 0.085, "reward": 1.86328125, "reward_std": 0.4100409746170044, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.91015625, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 720.2890625, "epoch": 0.1072, "grad_norm": 0.4883138727993324, "kl": 0.1171875, "learning_rate": 1.9996841892833e-05, "loss": 0.0833, "reward": 2.033203125, "reward_std": 0.4512961730360985, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 637.2578125, "epoch": 0.1076, "grad_norm": 0.4310432424678647, "kl": 0.121826171875, "learning_rate": 1.9996481265944146e-05, "loss": 0.0273, "reward": 2.228515625, "reward_std": 0.33507485687732697, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.955078125, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 707.34375, "epoch": 0.108, "grad_norm": 0.30435097043277154, "kl": 0.1123046875, "learning_rate": 1.9996101150403543e-05, "loss": 0.0577, "reward": 2.16015625, "reward_std": 0.41332750022411346, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.94921875, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 632.5390625, "epoch": 0.1084, "grad_norm": 0.347655130356992, "kl": 0.1217041015625, "learning_rate": 1.9995701546952252e-05, "loss": 0.0384, "reward": 2.205078125, "reward_std": 0.28443194925785065, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 669.15625, "epoch": 0.1088, "grad_norm": 0.2666942077669165, "kl": 0.1107177734375, "learning_rate": 1.9995282456369313e-05, "loss": 0.0256, "reward": 2.14453125, "reward_std": 0.17929667234420776, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 756.8359375, "epoch": 0.1092, "grad_norm": 0.3026511452333928, "kl": 0.1148681640625, "learning_rate": 1.999484387947177e-05, "loss": 0.0454, "reward": 2.046875, "reward_std": 0.24318469315767288, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 759.859375, "epoch": 0.1096, "grad_norm": 0.1616229377997104, "kl": 0.09716796875, "learning_rate": 1.9994385817114644e-05, "loss": 0.0193, "reward": 1.876953125, "reward_std": 0.167649507522583, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.947265625, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 785.7734375, "epoch": 0.11, "grad_norm": 0.27408961465013215, "kl": 0.103759765625, "learning_rate": 1.999390827019096e-05, "loss": 0.02, "reward": 2.21484375, "reward_std": 0.32107070088386536, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.97265625, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 699.9375, "epoch": 0.1104, "grad_norm": 0.2852489804113408, "kl": 0.10693359375, "learning_rate": 1.9993411239631713e-05, "loss": 0.0227, "reward": 2.1328125, "reward_std": 0.22867918759584427, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.984375, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 768.3046875, "epoch": 0.1108, "grad_norm": 0.31303707220072646, "kl": 0.1116943359375, "learning_rate": 1.9992894726405894e-05, "loss": 0.0368, "reward": 1.986328125, "reward_std": 0.1988266110420227, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 784.8671875, "epoch": 0.1112, "grad_norm": 0.3014928856156214, "kl": 0.1080322265625, "learning_rate": 1.999235873152047e-05, "loss": 0.0215, "reward": 2.0, "reward_std": 0.267794169485569, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 730.859375, "epoch": 0.1116, "grad_norm": 0.14730331304783711, "kl": 0.1044921875, "learning_rate": 1.9991803256020393e-05, "loss": 0.0087, "reward": 2.111328125, "reward_std": 0.0546875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 664.875, "epoch": 0.112, "grad_norm": 0.32514531187402623, "kl": 0.1055908203125, "learning_rate": 1.9991228300988586e-05, "loss": -0.0032, "reward": 2.087890625, "reward_std": 0.28241048753261566, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.986328125, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 735.375, "epoch": 0.1124, "grad_norm": 0.15743974539182193, "kl": 0.1043701171875, "learning_rate": 1.9990633867545956e-05, "loss": 0.0045, "reward": 2.15625, "reward_std": 0.055901698768138885, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 731.5546875, "epoch": 0.1128, "grad_norm": 0.3497762026746389, "kl": 0.1202392578125, "learning_rate": 1.9990019956851384e-05, "loss": -0.0106, "reward": 1.98828125, "reward_std": 0.26221735030412674, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 730.390625, "epoch": 0.1132, "grad_norm": 0.3381856532681638, "kl": 0.123046875, "learning_rate": 1.9989386570101716e-05, "loss": 0.0466, "reward": 2.02734375, "reward_std": 0.28885961323976517, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 731.890625, "epoch": 0.1136, "grad_norm": 0.3719913637718948, "kl": 0.1201171875, "learning_rate": 1.9988733708531772e-05, "loss": 0.0631, "reward": 2.244140625, "reward_std": 0.31491972506046295, "rewards/accuracy_reward": 0.3203125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.970703125, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 657.953125, "epoch": 0.114, "grad_norm": 0.3820599462458908, "kl": 0.13525390625, "learning_rate": 1.9988061373414342e-05, "loss": 0.0293, "reward": 2.16796875, "reward_std": 0.2222641110420227, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 731.953125, "epoch": 0.1144, "grad_norm": 0.2391882095223373, "kl": 0.1346435546875, "learning_rate": 1.998736956606018e-05, "loss": 0.0358, "reward": 2.0234375, "reward_std": 0.19850647449493408, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.9609375, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 727.3515625, "epoch": 0.1148, "grad_norm": 0.6955944504048183, "kl": 0.14404296875, "learning_rate": 1.998665828781799e-05, "loss": 0.0582, "reward": 1.939453125, "reward_std": 0.3292652368545532, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.962890625, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 701.6484375, "epoch": 0.1152, "grad_norm": 0.2539507211394045, "kl": 0.145751953125, "learning_rate": 1.9985927540074453e-05, "loss": 0.0748, "reward": 1.87890625, "reward_std": 0.3174709975719452, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94140625, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 606.703125, "epoch": 0.1156, "grad_norm": 0.5102145409554054, "kl": 0.15185546875, "learning_rate": 1.99851773242542e-05, "loss": 0.1334, "reward": 1.970703125, "reward_std": 0.4510403648018837, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.947265625, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 655.3828125, "epoch": 0.116, "grad_norm": 0.5282520378476349, "kl": 0.14208984375, "learning_rate": 1.9984407641819812e-05, "loss": 0.1095, "reward": 2.041015625, "reward_std": 0.4799608364701271, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.939453125, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 632.234375, "epoch": 0.1164, "grad_norm": 3633.922741932712, "kl": 171.09033203125, "learning_rate": 1.9983618494271825e-05, "loss": 10.3767, "reward": 1.97265625, "reward_std": 0.31771308928728104, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 601.625, "epoch": 0.1168, "grad_norm": 0.4992787028492908, "kl": 0.18310546875, "learning_rate": 1.998280988314872e-05, "loss": 0.1738, "reward": 1.8671875, "reward_std": 0.48877663910388947, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9296875, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 631.4453125, "epoch": 0.1172, "grad_norm": 0.906340267459195, "kl": 0.26025390625, "learning_rate": 1.9981981810026932e-05, "loss": 0.2564, "reward": 1.7265625, "reward_std": 0.5672774612903595, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.8828125, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 635.09375, "epoch": 0.1176, "grad_norm": 0.9272980257264551, "kl": 0.2490234375, "learning_rate": 1.9981134276520828e-05, "loss": 0.1653, "reward": 1.810546875, "reward_std": 0.4966537281870842, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.912109375, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 646.140625, "epoch": 0.118, "grad_norm": 0.8917764863104739, "kl": 0.213623046875, "learning_rate": 1.9980267284282718e-05, "loss": 0.158, "reward": 1.87109375, "reward_std": 0.4117606207728386, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94140625, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 638.0390625, "epoch": 0.1184, "grad_norm": 0.7053405293835856, "kl": 0.264404296875, "learning_rate": 1.9979380835002846e-05, "loss": 0.1455, "reward": 1.86328125, "reward_std": 0.4750836342573166, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.91796875, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 716.8125, "epoch": 0.1188, "grad_norm": 0.5458821926974888, "kl": 0.210693359375, "learning_rate": 1.9978474930409396e-05, "loss": 0.1106, "reward": 1.833984375, "reward_std": 0.39319124817848206, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.927734375, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 617.234375, "epoch": 0.1192, "grad_norm": 12.500047231597197, "kl": 0.72265625, "learning_rate": 1.997754957226847e-05, "loss": 0.1837, "reward": 1.916015625, "reward_std": 0.35422102361917496, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.962890625, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 666.359375, "epoch": 0.1196, "grad_norm": 1.5311428201852963, "kl": 0.3231201171875, "learning_rate": 1.99766047623841e-05, "loss": 0.0892, "reward": 1.88671875, "reward_std": 0.32089513540267944, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.95703125, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 658.640625, "epoch": 0.12, "grad_norm": 0.3230304532999706, "kl": 0.1380615234375, "learning_rate": 1.9975640502598243e-05, "loss": 0.026, "reward": 2.1875, "reward_std": 0.168813094496727, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 699.8515625, "epoch": 0.1204, "grad_norm": 1.6118024369048614, "kl": 0.17431640625, "learning_rate": 1.9974656794790777e-05, "loss": 0.0676, "reward": 2.025390625, "reward_std": 0.3304870054125786, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.962890625, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 638.4921875, "epoch": 0.1208, "grad_norm": 1.3095207300803389, "kl": 0.16796875, "learning_rate": 1.9973653640879486e-05, "loss": 0.1148, "reward": 1.951171875, "reward_std": 0.41860097646713257, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.951171875, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 605.171875, "epoch": 0.1212, "grad_norm": 0.550144526804644, "kl": 0.207763671875, "learning_rate": 1.997263104282007e-05, "loss": 0.1053, "reward": 1.97265625, "reward_std": 0.18981516361236572, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98828125, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 708.7578125, "epoch": 0.1216, "grad_norm": 1.6183170341028068, "kl": 0.2890625, "learning_rate": 1.997158900260614e-05, "loss": 0.1151, "reward": 1.6875, "reward_std": 0.4736321344971657, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.8125, "rewards/tag_count_reward": 0.8671875, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 708.2265625, "epoch": 0.122, "grad_norm": 0.23978486096826723, "kl": 0.15380859375, "learning_rate": 1.9970527522269204e-05, "loss": 0.0436, "reward": 1.951171875, "reward_std": 0.28486470878124237, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 657.234375, "epoch": 0.1224, "grad_norm": 0.575837611706063, "kl": 0.22021484375, "learning_rate": 1.9969446603878673e-05, "loss": 0.0327, "reward": 1.984375, "reward_std": 0.21902132779359818, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 793.5234375, "epoch": 0.1228, "grad_norm": 0.3066220900282079, "kl": 0.14453125, "learning_rate": 1.9968346249541848e-05, "loss": 0.0697, "reward": 1.888671875, "reward_std": 0.4929087460041046, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.8515625, "rewards/tag_count_reward": 0.904296875, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 681.2109375, "epoch": 0.1232, "grad_norm": 0.3218903863410939, "kl": 0.154296875, "learning_rate": 1.9967226461403934e-05, "loss": 0.0522, "reward": 1.953125, "reward_std": 0.30946769565343857, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 747.0234375, "epoch": 0.1236, "grad_norm": 0.3066905960668192, "kl": 0.14990234375, "learning_rate": 1.996608724164801e-05, "loss": 0.0597, "reward": 1.9296875, "reward_std": 0.2868804410099983, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 670.4296875, "epoch": 0.124, "grad_norm": 0.6397310409660981, "kl": 0.172119140625, "learning_rate": 1.9964928592495046e-05, "loss": 0.0404, "reward": 1.916015625, "reward_std": 0.28392383456230164, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.955078125, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 618.15625, "epoch": 0.1244, "grad_norm": 0.2505961654403157, "kl": 0.182373046875, "learning_rate": 1.9963750516203887e-05, "loss": 0.0596, "reward": 2.16796875, "reward_std": 0.17341843992471695, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 613.1875, "epoch": 0.1248, "grad_norm": 0.6335394931825246, "kl": 0.197509765625, "learning_rate": 1.996255301507125e-05, "loss": 0.0526, "reward": 2.1328125, "reward_std": 0.23207400739192963, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 682.140625, "epoch": 0.1252, "grad_norm": 0.13403676273645745, "kl": 0.165283203125, "learning_rate": 1.9961336091431728e-05, "loss": 0.0127, "reward": 1.943359375, "reward_std": 0.08854932337999344, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.982421875, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 590.3203125, "epoch": 0.1256, "grad_norm": 0.21400988374836266, "kl": 0.18359375, "learning_rate": 1.9960099747657774e-05, "loss": 0.0094, "reward": 2.037109375, "reward_std": 0.06765169650316238, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 0.998046875, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 661.7578125, "epoch": 0.126, "grad_norm": 0.30344341829269633, "kl": 0.1904296875, "learning_rate": 1.9958843986159705e-05, "loss": 0.0115, "reward": 1.9921875, "reward_std": 0.20969334244728088, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 612.21875, "epoch": 0.1264, "grad_norm": 0.2736694936598004, "kl": 0.171630859375, "learning_rate": 1.9957568809385693e-05, "loss": -0.0104, "reward": 2.046875, "reward_std": 0.12433473765850067, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 692.40625, "epoch": 0.1268, "grad_norm": 0.08015395993992434, "kl": 0.16455078125, "learning_rate": 1.995627421982176e-05, "loss": 0.0145, "reward": 2.111328125, "reward_std": 0.0546875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 709.21875, "epoch": 0.1272, "grad_norm": 0.1686760305833467, "kl": 0.18359375, "learning_rate": 1.995496021999177e-05, "loss": 0.0064, "reward": 2.015625, "reward_std": 0.042695626616477966, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 704.140625, "epoch": 0.1276, "grad_norm": 0.23856835972086354, "kl": 0.16650390625, "learning_rate": 1.995362681245744e-05, "loss": 0.0476, "reward": 1.986328125, "reward_std": 0.0546875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 721.9453125, "epoch": 0.128, "grad_norm": 0.2388512205372218, "kl": 0.162353515625, "learning_rate": 1.9952273999818312e-05, "loss": 0.0151, "reward": 2.212890625, "reward_std": 0.1478717252612114, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 778.78125, "epoch": 0.1284, "grad_norm": 0.3083955271666192, "kl": 0.16259765625, "learning_rate": 1.9950901784711765e-05, "loss": 0.0219, "reward": 2.28515625, "reward_std": 0.22659722715616226, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 784.5859375, "epoch": 0.1288, "grad_norm": 0.22541147030540323, "kl": 0.163818359375, "learning_rate": 1.9949510169813006e-05, "loss": 0.0207, "reward": 1.97265625, "reward_std": 0.109375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 734.5390625, "epoch": 0.1292, "grad_norm": 0.2813622455528445, "kl": 0.159423828125, "learning_rate": 1.994809915783505e-05, "loss": 0.0046, "reward": 2.236328125, "reward_std": 0.21248093992471695, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 685.9375, "epoch": 0.1296, "grad_norm": 3.4413725614837283, "kl": 0.17919921875, "learning_rate": 1.9946668751528745e-05, "loss": 0.0244, "reward": 2.125, "reward_std": 0.24747966974973679, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 713.3515625, "epoch": 0.13, "grad_norm": 0.2905577726159963, "kl": 0.163818359375, "learning_rate": 1.9945218953682736e-05, "loss": 0.0375, "reward": 2.03125, "reward_std": 0.1743273288011551, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 735.2421875, "epoch": 0.1304, "grad_norm": 0.2434759090870453, "kl": 0.149658203125, "learning_rate": 1.994374976712348e-05, "loss": 0.0218, "reward": 1.978515625, "reward_std": 0.1663404107093811, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 673.5546875, "epoch": 0.1308, "grad_norm": 0.2886287250345027, "kl": 0.146484375, "learning_rate": 1.9942261194715236e-05, "loss": 0.0016, "reward": 2.0390625, "reward_std": 0.13644562661647797, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 669.8125, "epoch": 0.1312, "grad_norm": 0.17399086190855462, "kl": 0.14111328125, "learning_rate": 1.9940753239360047e-05, "loss": 0.0175, "reward": 1.9765625, "reward_std": 0.09375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 679.7265625, "epoch": 0.1316, "grad_norm": 0.24416790786525028, "kl": 0.14990234375, "learning_rate": 1.9939225903997748e-05, "loss": 0.0047, "reward": 2.134765625, "reward_std": 0.1050766110420227, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 685.96875, "epoch": 0.132, "grad_norm": 0.2350603902389091, "kl": 0.16357421875, "learning_rate": 1.9937679191605964e-05, "loss": 0.0064, "reward": 2.142578125, "reward_std": 0.11058919876813889, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 653.6328125, "epoch": 0.1324, "grad_norm": 0.2515450568043921, "kl": 0.1494140625, "learning_rate": 1.9936113105200085e-05, "loss": 0.0157, "reward": 2.046875, "reward_std": 0.12433473765850067, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 633.53125, "epoch": 0.1328, "grad_norm": 0.36781638844417347, "kl": 0.148193359375, "learning_rate": 1.9934527647833276e-05, "loss": 0.0038, "reward": 2.0625, "reward_std": 0.2520497217774391, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 731.78125, "epoch": 0.1332, "grad_norm": 0.33650608635812, "kl": 0.1307373046875, "learning_rate": 1.993292282259647e-05, "loss": 0.0118, "reward": 2.0859375, "reward_std": 0.22753482311964035, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 675.3984375, "epoch": 0.1336, "grad_norm": 0.3072035903181854, "kl": 0.1341552734375, "learning_rate": 1.9931298632618355e-05, "loss": 0.0199, "reward": 2.12109375, "reward_std": 0.18332062661647797, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 660.90625, "epoch": 0.134, "grad_norm": 0.21842981123791194, "kl": 0.1280517578125, "learning_rate": 1.992965508106537e-05, "loss": 0.0397, "reward": 1.943359375, "reward_std": 0.22148846089839935, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 731.4140625, "epoch": 0.1344, "grad_norm": 0.20751159470250358, "kl": 0.1220703125, "learning_rate": 1.9927992171141707e-05, "loss": 0.0036, "reward": 2.0234375, "reward_std": 0.07394562661647797, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 757.1328125, "epoch": 0.1348, "grad_norm": 0.1876288689218191, "kl": 0.152099609375, "learning_rate": 1.992630990608929e-05, "loss": 0.0098, "reward": 2.15234375, "reward_std": 0.11911680549383163, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.99609375, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 758.4375, "epoch": 0.1352, "grad_norm": 0.2939663375095619, "kl": 0.132080078125, "learning_rate": 1.9924608289187786e-05, "loss": 0.0495, "reward": 2.044921875, "reward_std": 0.28836458921432495, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.966796875, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 707.8203125, "epoch": 0.1356, "grad_norm": 0.25510364604598035, "kl": 0.119873046875, "learning_rate": 1.992288732375458e-05, "loss": 0.0154, "reward": 2.04296875, "reward_std": 0.23171419650316238, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98828125, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 660.6953125, "epoch": 0.136, "grad_norm": 0.2620102101354659, "kl": 0.1258544921875, "learning_rate": 1.9921147013144782e-05, "loss": 0.0146, "reward": 2.2265625, "reward_std": 0.17914125323295593, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 640.59375, "epoch": 0.1364, "grad_norm": 0.2840958918333972, "kl": 0.13134765625, "learning_rate": 1.9919387360751216e-05, "loss": 0.0219, "reward": 2.2421875, "reward_std": 0.2093738168478012, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 732.375, "epoch": 0.1368, "grad_norm": 0.18965578870357228, "kl": 0.129638671875, "learning_rate": 1.9917608370004417e-05, "loss": -0.0049, "reward": 2.078125, "reward_std": 0.10724534839391708, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 756.640625, "epoch": 0.1372, "grad_norm": 0.09947122933687046, "kl": 0.12939453125, "learning_rate": 1.9915810044372618e-05, "loss": 0.0099, "reward": 1.990234375, "reward_std": 0.0390625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 719.3984375, "epoch": 0.1376, "grad_norm": 0.20879583803192286, "kl": 0.114990234375, "learning_rate": 1.9913992387361747e-05, "loss": 0.0135, "reward": 2.103515625, "reward_std": 0.15607112646102905, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.994140625, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 749.6328125, "epoch": 0.138, "grad_norm": 0.23565958466074496, "kl": 0.12060546875, "learning_rate": 1.991215540251542e-05, "loss": 0.0295, "reward": 1.96484375, "reward_std": 0.17836953699588776, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98828125, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 711.953125, "epoch": 0.1384, "grad_norm": 0.9267503996098491, "kl": 0.1318359375, "learning_rate": 1.991029909341493e-05, "loss": 0.0187, "reward": 2.095703125, "reward_std": 0.18847232311964035, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 702.3515625, "epoch": 0.1388, "grad_norm": 0.12249363799749703, "kl": 0.127197265625, "learning_rate": 1.9908423463679246e-05, "loss": 0.0077, "reward": 1.990234375, "reward_std": 0.0390625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 746.3203125, "epoch": 0.1392, "grad_norm": 1.9498753769636916, "kl": 0.1417236328125, "learning_rate": 1.990652851696501e-05, "loss": 0.0381, "reward": 1.97265625, "reward_std": 0.2161979004740715, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 667.3046875, "epoch": 0.1396, "grad_norm": 0.22661008018633827, "kl": 0.117431640625, "learning_rate": 1.9904614256966514e-05, "loss": 0.0142, "reward": 2.033203125, "reward_std": 0.16331055015325546, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.994140625, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 735.9453125, "epoch": 0.14, "grad_norm": 0.10514873846295204, "kl": 0.1201171875, "learning_rate": 1.9902680687415704e-05, "loss": 0.0127, "reward": 1.91796875, "reward_std": 0.109375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 749.3359375, "epoch": 0.1404, "grad_norm": 0.25191381671883395, "kl": 0.1390380859375, "learning_rate": 1.9900727812082177e-05, "loss": 0.017, "reward": 1.9921875, "reward_std": 0.13644562661647797, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 786.4296875, "epoch": 0.1408, "grad_norm": 0.2986796518610939, "kl": 0.14990234375, "learning_rate": 1.989875563477316e-05, "loss": 0.0338, "reward": 2.0390625, "reward_std": 0.26941104978322983, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 605.375, "epoch": 0.1412, "grad_norm": 0.07461842386337678, "kl": 0.12939453125, "learning_rate": 1.989676415933351e-05, "loss": 0.0051, "reward": 2.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 689.421875, "epoch": 0.1416, "grad_norm": 0.3210845584154317, "kl": 0.14111328125, "learning_rate": 1.9894753389645723e-05, "loss": 0.0272, "reward": 2.041015625, "reward_std": 0.19046786427497864, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 747.09375, "epoch": 0.142, "grad_norm": 0.3480933137765172, "kl": 0.1292724609375, "learning_rate": 1.9892723329629885e-05, "loss": 0.0454, "reward": 2.09375, "reward_std": 0.22883247584104538, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 669.3203125, "epoch": 0.1424, "grad_norm": 0.2019237147370717, "kl": 0.12109375, "learning_rate": 1.9890673983243708e-05, "loss": 0.0168, "reward": 1.984375, "reward_std": 0.09630206227302551, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 736.75, "epoch": 0.1428, "grad_norm": 2099516.1138803856, "kl": 19712.084228515625, "learning_rate": 1.9888605354482494e-05, "loss": 959.2102, "reward": 2.01171875, "reward_std": 0.17535917460918427, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 696.1171875, "epoch": 0.1432, "grad_norm": 83630.78767680221, "kl": 678.0614013671875, "learning_rate": 1.988651744737914e-05, "loss": 35.9214, "reward": 2.021484375, "reward_std": 0.2742908075451851, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 628.875, "epoch": 0.1436, "grad_norm": 1.6180221823833287, "kl": 0.217041015625, "learning_rate": 1.9884410266004134e-05, "loss": 0.0266, "reward": 2.013671875, "reward_std": 0.1350904181599617, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 623.390625, "epoch": 0.144, "grad_norm": 5086.392649398006, "kl": 37.01416015625, "learning_rate": 1.988228381446553e-05, "loss": 2.8947, "reward": 1.99609375, "reward_std": 0.1597641110420227, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 674.8828125, "epoch": 0.1444, "grad_norm": 76511.34156916743, "kl": 4352.109130859375, "learning_rate": 1.9880138096908955e-05, "loss": 201.5031, "reward": 2.203125, "reward_std": 0.2634412720799446, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 628.3359375, "epoch": 0.1448, "grad_norm": 3.792953314198105, "kl": 1.086669921875, "learning_rate": 1.987797311751759e-05, "loss": 0.1158, "reward": 2.015625, "reward_std": 0.3417075276374817, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 722.2734375, "epoch": 0.1452, "grad_norm": 3.050116347464841, "kl": 0.8203125, "learning_rate": 1.9875788880512183e-05, "loss": 0.0438, "reward": 1.994140625, "reward_std": 0.0859375, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 572.0625, "epoch": 0.1456, "grad_norm": 28.00813012631244, "kl": 2.47314453125, "learning_rate": 1.9873585390151003e-05, "loss": 0.1795, "reward": 2.03125, "reward_std": 0.2505621910095215, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 626.3984375, "epoch": 0.146, "grad_norm": 1.981707656454256, "kl": 0.25341796875, "learning_rate": 1.987136265072988e-05, "loss": 0.0633, "reward": 2.1015625, "reward_std": 0.17200922966003418, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 634.7265625, "epoch": 0.1464, "grad_norm": 18.823524543314136, "kl": 0.332275390625, "learning_rate": 1.9869120666582153e-05, "loss": 0.0453, "reward": 2.09375, "reward_std": 0.2691391110420227, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 678.453125, "epoch": 0.1468, "grad_norm": 10.915553167405216, "kl": 0.343017578125, "learning_rate": 1.986685944207868e-05, "loss": 0.058, "reward": 1.904296875, "reward_std": 0.21409358829259872, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 677.9140625, "epoch": 0.1472, "grad_norm": 7.254562398489455, "kl": 0.494384765625, "learning_rate": 1.9864578981627844e-05, "loss": 0.0654, "reward": 1.9921875, "reward_std": 0.2559390738606453, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 724.1015625, "epoch": 0.1476, "grad_norm": 79.07925576991067, "kl": 2.93505859375, "learning_rate": 1.986227928967551e-05, "loss": 0.1827, "reward": 1.9296875, "reward_std": 0.21930640190839767, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 691.0546875, "epoch": 0.148, "grad_norm": 0.21387402016947837, "kl": 0.1239013671875, "learning_rate": 1.985996037070505e-05, "loss": 0.0082, "reward": 2.337890625, "reward_std": 0.14675266295671463, "rewards/accuracy_reward": 0.3828125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.986328125, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 717.7421875, "epoch": 0.1484, "grad_norm": 2.6828621448413568, "kl": 1.02099609375, "learning_rate": 1.9857622229237315e-05, "loss": 0.1101, "reward": 2.041015625, "reward_std": 0.3847421854734421, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 746.5703125, "epoch": 0.1488, "grad_norm": 1.320587681712028, "kl": 0.615966796875, "learning_rate": 1.985526486983063e-05, "loss": 0.0565, "reward": 2.044921875, "reward_std": 0.22315485030412674, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 736.828125, "epoch": 0.1492, "grad_norm": 2266.2697248117456, "kl": 88.82080078125, "learning_rate": 1.985288829708079e-05, "loss": 4.932, "reward": 1.9140625, "reward_std": 0.3647398054599762, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.9609375, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 724.4453125, "epoch": 0.1496, "grad_norm": 2.4564163081868124, "kl": 0.418701171875, "learning_rate": 1.9850492515621038e-05, "loss": 0.0655, "reward": 1.935546875, "reward_std": 0.2291145622730255, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 738.7734375, "epoch": 0.15, "grad_norm": 3.2331892287846804, "kl": 1.3251953125, "learning_rate": 1.9848077530122083e-05, "loss": 0.1284, "reward": 1.931640625, "reward_std": 0.33077291399240494, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 754.3203125, "epoch": 0.1504, "grad_norm": 4.942539988407213, "kl": 1.775390625, "learning_rate": 1.9845643345292055e-05, "loss": 0.1747, "reward": 2.044921875, "reward_std": 0.36605776846408844, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 754.5078125, "epoch": 0.1508, "grad_norm": 5.444934011375037, "kl": 1.039306640625, "learning_rate": 1.9843189965876525e-05, "loss": 0.085, "reward": 2.015625, "reward_std": 0.3543400317430496, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 744.6796875, "epoch": 0.1512, "grad_norm": 2.313985175271108, "kl": 0.410400390625, "learning_rate": 1.9840717396658483e-05, "loss": 0.0648, "reward": 2.107421875, "reward_std": 0.23800812661647797, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 738.328125, "epoch": 0.1516, "grad_norm": 1.2291053732751114, "kl": 0.4501953125, "learning_rate": 1.983822564245833e-05, "loss": 0.0478, "reward": 2.0703125, "reward_std": 0.31193412840366364, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.953125, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 757.0390625, "epoch": 0.152, "grad_norm": 15023.416982723737, "kl": 1186.1650390625, "learning_rate": 1.983571470813386e-05, "loss": 62.868, "reward": 1.80859375, "reward_std": 0.4789893329143524, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.91796875, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 797.7890625, "epoch": 0.1524, "grad_norm": 177.9806483165382, "kl": 14.37109375, "learning_rate": 1.983318459858028e-05, "loss": 0.7286, "reward": 1.873046875, "reward_std": 0.3511757105588913, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.951171875, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 696.90625, "epoch": 0.1528, "grad_norm": 11.153004634615655, "kl": 2.379638671875, "learning_rate": 1.9830635318730155e-05, "loss": 0.1875, "reward": 2.001953125, "reward_std": 0.3180115148425102, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 722.8203125, "epoch": 0.1532, "grad_norm": 15.46383179005793, "kl": 0.706787109375, "learning_rate": 1.982806687355345e-05, "loss": 0.0908, "reward": 2.30078125, "reward_std": 0.39651254564523697, "rewards/accuracy_reward": 0.3828125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 824.34375, "epoch": 0.1536, "grad_norm": 2.2882504776671198, "kl": 0.55126953125, "learning_rate": 1.982547926805747e-05, "loss": 0.0712, "reward": 1.994140625, "reward_std": 0.4077717289328575, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.947265625, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 737.8125, "epoch": 0.154, "grad_norm": 3.363919546371468, "kl": 0.4661865234375, "learning_rate": 1.982287250728689e-05, "loss": 0.0746, "reward": 2.095703125, "reward_std": 0.22937870025634766, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 808.3203125, "epoch": 0.1544, "grad_norm": 2.8425624159778353, "kl": 1.0244140625, "learning_rate": 1.982024659632372e-05, "loss": 0.1054, "reward": 1.931640625, "reward_std": 0.48854973912239075, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.923828125, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 785.65625, "epoch": 0.1548, "grad_norm": 10.855938436390836, "kl": 3.833984375, "learning_rate": 1.981760154028731e-05, "loss": 0.2777, "reward": 1.818359375, "reward_std": 0.5420204401016235, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.912109375, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 750.5, "epoch": 0.1552, "grad_norm": 3.5374054030267494, "kl": 0.8853759765625, "learning_rate": 1.981493734433433e-05, "loss": 0.0911, "reward": 2.099609375, "reward_std": 0.39255570620298386, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 826.5, "epoch": 0.1556, "grad_norm": 123.27994100893913, "kl": 16.072265625, "learning_rate": 1.981225401365877e-05, "loss": 0.8326, "reward": 1.9375, "reward_std": 0.4664420709013939, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.8984375, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 742.171875, "epoch": 0.156, "grad_norm": 5.664382678911469, "kl": 0.79107666015625, "learning_rate": 1.9809551553491918e-05, "loss": 0.1329, "reward": 1.986328125, "reward_std": 0.4761813208460808, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.947265625, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 866.3515625, "epoch": 0.1564, "grad_norm": 13.518773779320568, "kl": 3.9404296875, "learning_rate": 1.9806829969102356e-05, "loss": 0.2947, "reward": 1.611328125, "reward_std": 0.7172112166881561, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.75, "rewards/tag_count_reward": 0.837890625, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 775.5625, "epoch": 0.1568, "grad_norm": 2.327241073090422, "kl": 1.4102783203125, "learning_rate": 1.980408926579596e-05, "loss": 0.1688, "reward": 1.75390625, "reward_std": 0.49514254927635193, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.828125, "rewards/tag_count_reward": 0.88671875, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 822.546875, "epoch": 0.1572, "grad_norm": 2.886925901169088, "kl": 1.255859375, "learning_rate": 1.9801329448915863e-05, "loss": 0.1319, "reward": 1.919921875, "reward_std": 0.5664883553981781, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.896484375, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 806.328125, "epoch": 0.1576, "grad_norm": 4.654745427732301, "kl": 1.72265625, "learning_rate": 1.979855052384247e-05, "loss": 0.1985, "reward": 1.6640625, "reward_std": 0.6366118341684341, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.7109375, "rewards/tag_count_reward": 0.8203125, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 838.7265625, "epoch": 0.158, "grad_norm": 4.39088556550696, "kl": 1.296875, "learning_rate": 1.979575249599344e-05, "loss": 0.1869, "reward": 1.634765625, "reward_std": 0.6772811561822891, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.7109375, "rewards/tag_count_reward": 0.798828125, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 833.671875, "epoch": 0.1584, "grad_norm": 2.2631446839599105, "kl": 0.9951171875, "learning_rate": 1.9792935370823676e-05, "loss": 0.1316, "reward": 1.71875, "reward_std": 0.6153614595532417, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.828125, "rewards/tag_count_reward": 0.875, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 817.9921875, "epoch": 0.1588, "grad_norm": 1.807034356964978, "kl": 1.1103515625, "learning_rate": 1.97900991538253e-05, "loss": 0.1637, "reward": 1.89453125, "reward_std": 0.6827403753995895, "rewards/accuracy_reward": 0.3359375, "rewards/format_reward": 0.734375, "rewards/tag_count_reward": 0.82421875, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 766.65625, "epoch": 0.1592, "grad_norm": 3.6266531195057135, "kl": 1.04638671875, "learning_rate": 1.9787243850527663e-05, "loss": 0.1588, "reward": 1.953125, "reward_std": 0.5027559101581573, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.921875, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 728.4921875, "epoch": 0.1596, "grad_norm": 29.83761594905701, "kl": 4.2078857421875, "learning_rate": 1.9784369466497333e-05, "loss": 0.3067, "reward": 1.888671875, "reward_std": 0.3929111585021019, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.951171875, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 758.890625, "epoch": 0.16, "grad_norm": 0.25297363805323664, "kl": 0.0843505859375, "learning_rate": 1.9781476007338058e-05, "loss": 0.0312, "reward": 1.986328125, "reward_std": 0.1988266110420227, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 683.8828125, "epoch": 0.1604, "grad_norm": 8.124641531470536, "kl": 0.791015625, "learning_rate": 1.977856347869079e-05, "loss": 0.1025, "reward": 2.0, "reward_std": 0.30263709276914597, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 628.640625, "epoch": 0.1608, "grad_norm": 2.05019043006238, "kl": 0.47412109375, "learning_rate": 1.9775631886233655e-05, "loss": 0.0572, "reward": 2.033203125, "reward_std": 0.1171875, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 739.0078125, "epoch": 0.1612, "grad_norm": 53.9005138720146, "kl": 3.2203369140625, "learning_rate": 1.9772681235681936e-05, "loss": 0.4355, "reward": 2.125, "reward_std": 0.24933473765850067, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 755.734375, "epoch": 0.1616, "grad_norm": 0.19303389598721876, "kl": 0.0975341796875, "learning_rate": 1.9769711532788083e-05, "loss": 0.0236, "reward": 2.052734375, "reward_std": 0.10361222177743912, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 639.4296875, "epoch": 0.162, "grad_norm": 1.8287510194160688, "kl": 0.169189453125, "learning_rate": 1.9766722783341682e-05, "loss": -0.0019, "reward": 2.009765625, "reward_std": 0.12726997584104538, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 773.4140625, "epoch": 0.1624, "grad_norm": 2.298597095690096, "kl": 0.8790283203125, "learning_rate": 1.976371499316945e-05, "loss": 0.059, "reward": 2.078125, "reward_std": 0.2954466789960861, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 709.28125, "epoch": 0.1628, "grad_norm": 311.8870615846556, "kl": 29.697265625, "learning_rate": 1.9760688168135233e-05, "loss": 1.3914, "reward": 2.005859375, "reward_std": 0.15068094432353973, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 686.40625, "epoch": 0.1632, "grad_norm": 0.31234016324412245, "kl": 0.119140625, "learning_rate": 1.9757642314139977e-05, "loss": 0.0184, "reward": 2.0234375, "reward_std": 0.25473497062921524, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 651.1953125, "epoch": 0.1636, "grad_norm": 5.008477374245358, "kl": 0.20849609375, "learning_rate": 1.9754577437121733e-05, "loss": 0.0187, "reward": 1.982421875, "reward_std": 0.13857005536556244, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 676.015625, "epoch": 0.164, "grad_norm": 0.41833823400729114, "kl": 0.121337890625, "learning_rate": 1.9751493543055634e-05, "loss": 0.0103, "reward": 2.01953125, "reward_std": 0.1990520879626274, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 697.125, "epoch": 0.1644, "grad_norm": 0.5312120493557465, "kl": 0.11328125, "learning_rate": 1.974839063795389e-05, "loss": 0.0173, "reward": 2.125, "reward_std": 0.1441391110420227, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 612.3359375, "epoch": 0.1648, "grad_norm": 0.5058383117915846, "kl": 0.114501953125, "learning_rate": 1.9745268727865774e-05, "loss": -0.0044, "reward": 2.017578125, "reward_std": 0.1756243333220482, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.994140625, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 759.0703125, "epoch": 0.1652, "grad_norm": 0.7667870545634549, "kl": 0.1199951171875, "learning_rate": 1.9742127818877605e-05, "loss": 0.0361, "reward": 1.921875, "reward_std": 0.22612712532281876, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.9765625, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 764.3828125, "epoch": 0.1656, "grad_norm": 2.8639472973075493, "kl": 0.4608154296875, "learning_rate": 1.9738967917112752e-05, "loss": 0.1318, "reward": 2.064453125, "reward_std": 0.4050080478191376, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.955078125, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 702.0390625, "epoch": 0.166, "grad_norm": 15.2310189617219, "kl": 3.397216796875, "learning_rate": 1.9735789028731603e-05, "loss": 0.214, "reward": 1.88671875, "reward_std": 0.39959314465522766, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.94921875, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 725.28125, "epoch": 0.1664, "grad_norm": 1.7318267544506127, "kl": 1.13427734375, "learning_rate": 1.9732591159931564e-05, "loss": 0.1025, "reward": 2.09765625, "reward_std": 0.42662497609853745, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.95703125, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 771.15625, "epoch": 0.1668, "grad_norm": 12.301697454164712, "kl": 9.3377685546875, "learning_rate": 1.972937431694704e-05, "loss": 0.1561, "reward": 2.166015625, "reward_std": 0.6427386105060577, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.8515625, "rewards/tag_count_reward": 0.908203125, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 811.2421875, "epoch": 0.1672, "grad_norm": 3.824241603736238, "kl": 0.421630859375, "learning_rate": 1.9726138506049438e-05, "loss": 0.0637, "reward": 1.869140625, "reward_std": 0.4588431939482689, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.923828125, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 715.6328125, "epoch": 0.1676, "grad_norm": 1.8625007522895727, "kl": 0.688232421875, "learning_rate": 1.9722883733547128e-05, "loss": 0.1296, "reward": 1.923828125, "reward_std": 0.41155726462602615, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.931640625, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 744.65625, "epoch": 0.168, "grad_norm": 0.542412320275697, "kl": 0.279296875, "learning_rate": 1.9719610005785466e-05, "loss": 0.0522, "reward": 1.9921875, "reward_std": 0.4422407001256943, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.921875, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 746.8359375, "epoch": 0.1684, "grad_norm": 2.538537570339363, "kl": 0.303955078125, "learning_rate": 1.971631732914674e-05, "loss": 0.0368, "reward": 1.974609375, "reward_std": 0.3529726639389992, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 762.296875, "epoch": 0.1688, "grad_norm": 7.147440661937373, "kl": 0.6630859375, "learning_rate": 1.9713005710050203e-05, "loss": 0.0789, "reward": 1.84375, "reward_std": 0.4205366149544716, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9375, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 804.0234375, "epoch": 0.1692, "grad_norm": 5.991393744301162, "kl": 0.8681640625, "learning_rate": 1.9709675154952017e-05, "loss": 0.0979, "reward": 1.884765625, "reward_std": 0.4647233486175537, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.892578125, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 714.359375, "epoch": 0.1696, "grad_norm": 2.784369720050774, "kl": 1.3369140625, "learning_rate": 1.9706325670345276e-05, "loss": 0.1553, "reward": 1.990234375, "reward_std": 0.4088764488697052, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.943359375, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 670.4765625, "epoch": 0.17, "grad_norm": 6.242210664857476, "kl": 0.677734375, "learning_rate": 1.9702957262759964e-05, "loss": 0.1257, "reward": 1.857421875, "reward_std": 0.45988186448812485, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.935546875, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 696.6015625, "epoch": 0.1704, "grad_norm": 2.712944212526007, "kl": 0.81640625, "learning_rate": 1.9699569938762975e-05, "loss": 0.1205, "reward": 1.88671875, "reward_std": 0.4911230057477951, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.94140625, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 749.9453125, "epoch": 0.1708, "grad_norm": 1.7226591813486125, "kl": 0.83544921875, "learning_rate": 1.969616370495806e-05, "loss": 0.0852, "reward": 1.734375, "reward_std": 0.513997495174408, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.890625, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 818.15625, "epoch": 0.1712, "grad_norm": 2.9064924089487643, "kl": 1.0849609375, "learning_rate": 1.9692738567985853e-05, "loss": 0.1128, "reward": 1.671875, "reward_std": 0.6460402011871338, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8046875, "rewards/tag_count_reward": 0.8671875, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 705.5625, "epoch": 0.1716, "grad_norm": 1.3607440499111387, "kl": 0.58447265625, "learning_rate": 1.968929453452383e-05, "loss": 0.11, "reward": 1.912109375, "reward_std": 0.6003322601318359, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.912109375, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 715.265625, "epoch": 0.172, "grad_norm": 2.016075389732635, "kl": 0.524169921875, "learning_rate": 1.9685831611286312e-05, "loss": 0.1096, "reward": 1.783203125, "reward_std": 0.5414917469024658, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.908203125, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 734.109375, "epoch": 0.1724, "grad_norm": 1.1431626780355546, "kl": 0.2408447265625, "learning_rate": 1.9682349805024447e-05, "loss": 0.0531, "reward": 1.830078125, "reward_std": 0.40885648876428604, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.931640625, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 682.9609375, "epoch": 0.1728, "grad_norm": 1.0362079471350834, "kl": 0.216064453125, "learning_rate": 1.967884912252619e-05, "loss": 0.0458, "reward": 2.0703125, "reward_std": 0.45444805920124054, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.953125, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 713.9296875, "epoch": 0.1732, "grad_norm": 7.0175852971778205, "kl": 1.9052734375, "learning_rate": 1.96753295706163e-05, "loss": 0.146, "reward": 1.740234375, "reward_std": 0.4144354909658432, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.828125, "rewards/tag_count_reward": 0.912109375, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 780.5625, "epoch": 0.1736, "grad_norm": 4.758431383612995, "kl": 1.58203125, "learning_rate": 1.967179115615633e-05, "loss": 0.2164, "reward": 1.58203125, "reward_std": 0.8039745092391968, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.6875, "rewards/tag_count_reward": 0.80859375, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 784.453125, "epoch": 0.174, "grad_norm": 1.04365802688044, "kl": 0.8505859375, "learning_rate": 1.9668233886044597e-05, "loss": 0.1263, "reward": 1.9296875, "reward_std": 0.6264050602912903, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.78125, "rewards/tag_count_reward": 0.8828125, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 862.5390625, "epoch": 0.1744, "grad_norm": 0.8533372007499185, "kl": 0.444580078125, "learning_rate": 1.9664657767216176e-05, "loss": 0.0712, "reward": 1.546875, "reward_std": 0.636738657951355, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.7265625, "rewards/tag_count_reward": 0.8203125, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 836.796875, "epoch": 0.1748, "grad_norm": 1.4473494430233635, "kl": 0.53466796875, "learning_rate": 1.9661062806642903e-05, "loss": 0.0666, "reward": 1.87890625, "reward_std": 0.5433821007609367, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.8515625, "rewards/tag_count_reward": 0.90234375, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 840.0390625, "epoch": 0.1752, "grad_norm": 0.9049431435465403, "kl": 0.5855712890625, "learning_rate": 1.9657449011333328e-05, "loss": 0.1174, "reward": 1.9140625, "reward_std": 0.6835845857858658, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.8046875, "rewards/tag_count_reward": 0.875, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 752.4609375, "epoch": 0.1756, "grad_norm": 2.3250399188598276, "kl": 0.986328125, "learning_rate": 1.965381638833274e-05, "loss": 0.1966, "reward": 1.775390625, "reward_std": 0.7374754548072815, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.7734375, "rewards/tag_count_reward": 0.837890625, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 863.328125, "epoch": 0.176, "grad_norm": 2.3148743495075035, "kl": 0.48974609375, "learning_rate": 1.9650164944723116e-05, "loss": 0.0832, "reward": 1.84765625, "reward_std": 0.6493112444877625, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.8203125, "rewards/tag_count_reward": 0.87109375, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 782.046875, "epoch": 0.1764, "grad_norm": 2.144177892224825, "kl": 0.3477783203125, "learning_rate": 1.9646494687623135e-05, "loss": 0.0425, "reward": 1.93359375, "reward_std": 0.49504996836185455, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.88671875, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 703.9140625, "epoch": 0.1768, "grad_norm": 0.8741053382951127, "kl": 0.25830078125, "learning_rate": 1.964280562418815e-05, "loss": 0.0763, "reward": 2.16796875, "reward_std": 0.4758947119116783, "rewards/accuracy_reward": 0.2734375, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 786.53125, "epoch": 0.1772, "grad_norm": 0.603530287325241, "kl": 0.2706298828125, "learning_rate": 1.9639097761610174e-05, "loss": 0.0595, "reward": 2.0, "reward_std": 0.32349997758865356, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 742.4296875, "epoch": 0.1776, "grad_norm": 0.6732270060136167, "kl": 0.281005859375, "learning_rate": 1.963537110711789e-05, "loss": 0.0646, "reward": 1.958984375, "reward_std": 0.36037445440888405, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.958984375, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 816.046875, "epoch": 0.178, "grad_norm": 2.0001605546004875, "kl": 1.117431640625, "learning_rate": 1.9631625667976584e-05, "loss": 0.1025, "reward": 1.763671875, "reward_std": 0.35119638592004776, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.828125, "rewards/tag_count_reward": 0.865234375, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 711.625, "epoch": 0.1784, "grad_norm": 0.2757687141665582, "kl": 0.200439453125, "learning_rate": 1.962786145148819e-05, "loss": 0.0303, "reward": 2.24609375, "reward_std": 0.18519414216279984, "rewards/accuracy_reward": 0.2734375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 708.1171875, "epoch": 0.1788, "grad_norm": 0.1728076667132946, "kl": 0.143798828125, "learning_rate": 1.962407846499124e-05, "loss": 0.0181, "reward": 2.017578125, "reward_std": 0.11058919876813889, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 593.046875, "epoch": 0.1792, "grad_norm": 0.2290258501219788, "kl": 0.19580078125, "learning_rate": 1.962027671586086e-05, "loss": 0.1019, "reward": 2.22265625, "reward_std": 0.109375, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 688.0546875, "epoch": 0.1796, "grad_norm": 2.0854171229470246, "kl": 0.282470703125, "learning_rate": 1.9616456211508756e-05, "loss": 0.0538, "reward": 2.01953125, "reward_std": 0.43394726514816284, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.96484375, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 626.6875, "epoch": 0.18, "grad_norm": 0.35066889202321333, "kl": 0.201171875, "learning_rate": 1.961261695938319e-05, "loss": 0.0189, "reward": 2.05859375, "reward_std": 0.18145641684532166, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 744.4453125, "epoch": 0.1804, "grad_norm": 0.3249450400715337, "kl": 0.1484375, "learning_rate": 1.9608758966968987e-05, "loss": 0.0185, "reward": 1.9921875, "reward_std": 0.13644562661647797, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 755.609375, "epoch": 0.1808, "grad_norm": 0.35655325145532485, "kl": 0.186767578125, "learning_rate": 1.96048822417875e-05, "loss": 0.0331, "reward": 2.087890625, "reward_std": 0.08086346089839935, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 750.7890625, "epoch": 0.1812, "grad_norm": 0.7972428854518856, "kl": 0.229736328125, "learning_rate": 1.96009867913966e-05, "loss": 0.0464, "reward": 1.90625, "reward_std": 0.31466206908226013, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.953125, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 732.71875, "epoch": 0.1816, "grad_norm": 1.201895419326617, "kl": 0.170166015625, "learning_rate": 1.9597072623390668e-05, "loss": 0.0527, "reward": 2.05859375, "reward_std": 0.20226941257715225, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.97265625, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 746.453125, "epoch": 0.182, "grad_norm": 0.1409046647181762, "kl": 0.11767578125, "learning_rate": 1.9593139745400575e-05, "loss": 0.0241, "reward": 2.078125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 776.875, "epoch": 0.1824, "grad_norm": 0.22651504880041412, "kl": 0.1103515625, "learning_rate": 1.958918816509367e-05, "loss": 0.0101, "reward": 1.8828125, "reward_std": 0.3188854530453682, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.9296875, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 752.828125, "epoch": 0.1828, "grad_norm": 1.1946473010185878, "kl": 0.270263671875, "learning_rate": 1.958521789017376e-05, "loss": 0.0489, "reward": 1.99609375, "reward_std": 0.203236885368824, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 737.1171875, "epoch": 0.1832, "grad_norm": 0.384657466825161, "kl": 0.175537109375, "learning_rate": 1.95812289283811e-05, "loss": 0.0361, "reward": 2.212890625, "reward_std": 0.25363312661647797, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.994140625, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 709.4140625, "epoch": 0.1836, "grad_norm": 0.27137708902939617, "kl": 0.14501953125, "learning_rate": 1.9577221287492368e-05, "loss": 0.0119, "reward": 2.10546875, "reward_std": 0.078125, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.98828125, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 708.40625, "epoch": 0.184, "grad_norm": 0.23614691453743542, "kl": 0.142578125, "learning_rate": 1.9573194975320672e-05, "loss": 0.0122, "reward": 2.095703125, "reward_std": 0.09738312661647797, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 752.25, "epoch": 0.1844, "grad_norm": 9.008189032144081, "kl": 1.2701416015625, "learning_rate": 1.9569149999715514e-05, "loss": 0.0855, "reward": 1.9453125, "reward_std": 0.19396330416202545, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.984375, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 676.0234375, "epoch": 0.1848, "grad_norm": 0.21894116043282177, "kl": 0.159423828125, "learning_rate": 1.956508636856278e-05, "loss": 0.007, "reward": 2.109375, "reward_std": 0.11022830754518509, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 644.4453125, "epoch": 0.1852, "grad_norm": 1.4814201164032539, "kl": 0.338134765625, "learning_rate": 1.9561004089784726e-05, "loss": 0.0504, "reward": 2.173828125, "reward_std": 0.23673021793365479, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 678.515625, "epoch": 0.1856, "grad_norm": 0.777736965619585, "kl": 0.24169921875, "learning_rate": 1.9556903171339963e-05, "loss": 0.0575, "reward": 2.099609375, "reward_std": 0.39426930621266365, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 688.7890625, "epoch": 0.186, "grad_norm": 0.7923808705462455, "kl": 0.19140625, "learning_rate": 1.9552783621223437e-05, "loss": 0.0264, "reward": 2.037109375, "reward_std": 0.24757513403892517, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 687.6796875, "epoch": 0.1864, "grad_norm": 8.882650533703625, "kl": 0.516357421875, "learning_rate": 1.9548645447466433e-05, "loss": 0.0238, "reward": 2.0859375, "reward_std": 0.17108610272407532, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.984375, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 774.9765625, "epoch": 0.1868, "grad_norm": 0.28445730115520207, "kl": 0.142578125, "learning_rate": 1.9544488658136522e-05, "loss": 0.0106, "reward": 1.98046875, "reward_std": 0.18304429948329926, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98828125, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 769.1015625, "epoch": 0.1872, "grad_norm": 0.7983973463264955, "kl": 0.16552734375, "learning_rate": 1.954031326133758e-05, "loss": 0.0337, "reward": 2.26953125, "reward_std": 0.2153589427471161, "rewards/accuracy_reward": 0.3203125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 796.7734375, "epoch": 0.1876, "grad_norm": 0.8757191012516296, "kl": 0.17724609375, "learning_rate": 1.9536119265209763e-05, "loss": 0.0125, "reward": 2.02734375, "reward_std": 0.2977531850337982, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94921875, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 821.4140625, "epoch": 0.188, "grad_norm": 0.5282975391301101, "kl": 0.213623046875, "learning_rate": 1.9531906677929472e-05, "loss": 0.0478, "reward": 1.85546875, "reward_std": 0.38836830854415894, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.94921875, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 875.8515625, "epoch": 0.1884, "grad_norm": 0.35934522329122415, "kl": 0.156982421875, "learning_rate": 1.9527675507709368e-05, "loss": 0.0465, "reward": 2.013671875, "reward_std": 0.46308623254299164, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.943359375, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 894.8046875, "epoch": 0.1888, "grad_norm": 0.39409851562834, "kl": 0.21826171875, "learning_rate": 1.9523425762798328e-05, "loss": 0.0469, "reward": 1.759765625, "reward_std": 0.5556391850113869, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.78125, "rewards/tag_count_reward": 0.884765625, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 772.8671875, "epoch": 0.1892, "grad_norm": 0.28565929297283105, "kl": 0.169189453125, "learning_rate": 1.9519157451481453e-05, "loss": 0.0248, "reward": 2.060546875, "reward_std": 0.2508021518588066, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.990234375, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 685.5859375, "epoch": 0.1896, "grad_norm": 0.35020185086024325, "kl": 0.184326171875, "learning_rate": 1.951487058208003e-05, "loss": 0.0224, "reward": 2.119140625, "reward_std": 0.0859375, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 679.046875, "epoch": 0.19, "grad_norm": 0.4612516693209766, "kl": 0.232666015625, "learning_rate": 1.9510565162951538e-05, "loss": 0.054, "reward": 2.072265625, "reward_std": 0.22751451283693314, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 655.1953125, "epoch": 0.1904, "grad_norm": 17.07458789181295, "kl": 1.35107421875, "learning_rate": 1.95062412024896e-05, "loss": 0.2204, "reward": 2.169921875, "reward_std": 0.5486999005079269, "rewards/accuracy_reward": 0.3515625, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.927734375, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 634.328125, "epoch": 0.1908, "grad_norm": 176.02515906594041, "kl": 16.951171875, "learning_rate": 1.950189870912401e-05, "loss": 0.7926, "reward": 1.955078125, "reward_std": 0.28793276846408844, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 627.4453125, "epoch": 0.1912, "grad_norm": 1.1554344549210398, "kl": 0.3974609375, "learning_rate": 1.949753769132067e-05, "loss": 0.1584, "reward": 2.08984375, "reward_std": 0.4290042743086815, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94921875, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 623.0, "epoch": 0.1916, "grad_norm": 168593.85032371106, "kl": 16512.14111328125, "learning_rate": 1.9493158157581617e-05, "loss": 968.2107, "reward": 2.138671875, "reward_std": 0.23569566011428833, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 561.1875, "epoch": 0.192, "grad_norm": 0.7353045002437669, "kl": 0.20751953125, "learning_rate": 1.9488760116444966e-05, "loss": 0.0313, "reward": 2.033203125, "reward_std": 0.2759895622730255, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 620.5390625, "epoch": 0.1924, "grad_norm": 0.24181773231863485, "kl": 0.17431640625, "learning_rate": 1.9484343576484935e-05, "loss": 0.0293, "reward": 2.119140625, "reward_std": 0.0859375, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 620.140625, "epoch": 0.1928, "grad_norm": 0.6697528295162255, "kl": 0.170166015625, "learning_rate": 1.9479908546311783e-05, "loss": 0.0451, "reward": 1.966796875, "reward_std": 0.1953125, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 669.8359375, "epoch": 0.1932, "grad_norm": 0.14811474478592196, "kl": 0.15576171875, "learning_rate": 1.947545503457184e-05, "loss": 0.0078, "reward": 1.994140625, "reward_std": 0.06476997584104538, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 678.765625, "epoch": 0.1936, "grad_norm": 23.970143856127997, "kl": 0.217041015625, "learning_rate": 1.9470983049947446e-05, "loss": 0.04, "reward": 1.919921875, "reward_std": 0.18663493543863297, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 722.703125, "epoch": 0.194, "grad_norm": 2.465610544734644, "kl": 0.255859375, "learning_rate": 1.9466492601156964e-05, "loss": 0.147, "reward": 1.736328125, "reward_std": 0.574606865644455, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.7734375, "rewards/tag_count_reward": 0.830078125, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 671.7578125, "epoch": 0.1944, "grad_norm": 0.6643908514096619, "kl": 0.21044921875, "learning_rate": 1.946198369695476e-05, "loss": 0.0921, "reward": 1.765625, "reward_std": 0.3700053542852402, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.8984375, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 656.2109375, "epoch": 0.1948, "grad_norm": 1.0024290925094403, "kl": 0.185546875, "learning_rate": 1.945745634613117e-05, "loss": 0.0855, "reward": 1.84765625, "reward_std": 0.4032595455646515, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.93359375, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 698.9609375, "epoch": 0.1952, "grad_norm": 2.47563779003952, "kl": 0.3271484375, "learning_rate": 1.9452910557512497e-05, "loss": 0.2273, "reward": 1.666015625, "reward_std": 0.6689284443855286, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.7890625, "rewards/tag_count_reward": 0.845703125, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 738.0859375, "epoch": 0.1956, "grad_norm": 1.6973230119832947, "kl": 0.2333984375, "learning_rate": 1.9448346339960984e-05, "loss": 0.1195, "reward": 1.890625, "reward_std": 0.5439529865980148, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.90625, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 785.078125, "epoch": 0.196, "grad_norm": 154208286.27240637, "kl": 8142848.109863281, "learning_rate": 1.944376370237481e-05, "loss": 403458.3438, "reward": 1.708984375, "reward_std": 0.7009203881025314, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.7421875, "rewards/tag_count_reward": 0.841796875, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 764.171875, "epoch": 0.1964, "grad_norm": 3.4837176706789013, "kl": 0.376953125, "learning_rate": 1.9439162653688066e-05, "loss": 0.2017, "reward": 1.603515625, "reward_std": 0.7962769567966461, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.734375, "rewards/tag_count_reward": 0.806640625, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 899.3125, "epoch": 0.1968, "grad_norm": 11.603183207907334, "kl": 2.26806640625, "learning_rate": 1.9434543202870726e-05, "loss": 0.2746, "reward": 0.76171875, "reward_std": 0.6937493458390236, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.2578125, "rewards/tag_count_reward": 0.45703125, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 960.03125, "epoch": 0.1972, "grad_norm": 9.045872132386735, "kl": 0.8046875, "learning_rate": 1.9429905358928648e-05, "loss": 0.1489, "reward": 0.54296875, "reward_std": 0.4785940870642662, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.1640625, "rewards/tag_count_reward": 0.37890625, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 979.0625, "epoch": 0.1976, "grad_norm": 7.339492461414747, "kl": 1.517578125, "learning_rate": 1.9425249130903544e-05, "loss": 0.1677, "reward": 0.64453125, "reward_std": 0.3466637656092644, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.078125, "rewards/tag_count_reward": 0.30859375, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 815.90625, "epoch": 0.198, "grad_norm": 4.381580414080547, "kl": 1.5869140625, "learning_rate": 1.942057452787297e-05, "loss": 0.3544, "reward": 0.890625, "reward_std": 0.6788236871361732, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.3515625, "rewards/tag_count_reward": 0.515625, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 834.328125, "epoch": 0.1984, "grad_norm": 24.15921073805826, "kl": 0.767578125, "learning_rate": 1.9415881558950302e-05, "loss": 0.2212, "reward": 1.16015625, "reward_std": 0.8119282722473145, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.515625, "rewards/tag_count_reward": 0.63671875, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 759.4765625, "epoch": 0.1988, "grad_norm": 1.7249267470498977, "kl": 0.213134765625, "learning_rate": 1.9411170233284728e-05, "loss": 0.0733, "reward": 1.79296875, "reward_std": 0.3618658259510994, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.89453125, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 779.0, "epoch": 0.1992, "grad_norm": 1.6288891220045851, "kl": 0.142333984375, "learning_rate": 1.9406440560061214e-05, "loss": 0.0833, "reward": 1.716796875, "reward_std": 0.4789683297276497, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.8046875, "rewards/tag_count_reward": 0.873046875, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 709.5390625, "epoch": 0.1996, "grad_norm": 0.5353496974324046, "kl": 0.1356201171875, "learning_rate": 1.9401692548500504e-05, "loss": 0.0894, "reward": 1.9765625, "reward_std": 0.448553130030632, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.9375, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 673.0390625, "epoch": 0.2, "grad_norm": 13.137123389148018, "kl": 0.1136474609375, "learning_rate": 1.9396926207859085e-05, "loss": 0.0434, "reward": 2.04296875, "reward_std": 0.37542494386434555, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.94140625, "step": 500 }, { "clip_ratio": 0.0, "completion_length": 707.796875, "epoch": 0.2004, "grad_norm": 0.3478355654070266, "kl": 0.1314697265625, "learning_rate": 1.9392141547429183e-05, "loss": 0.0457, "reward": 1.853515625, "reward_std": 0.2978722006082535, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.939453125, "step": 501 }, { "clip_ratio": 0.0, "completion_length": 678.1875, "epoch": 0.2008, "grad_norm": 0.29299064967182337, "kl": 0.1307373046875, "learning_rate": 1.9387338576538743e-05, "loss": 0.0448, "reward": 1.984375, "reward_std": 0.3247150704264641, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 502 }, { "clip_ratio": 0.0, "completion_length": 726.9609375, "epoch": 0.2012, "grad_norm": 2.9209099417015345, "kl": 0.257568359375, "learning_rate": 1.9382517304551397e-05, "loss": 0.0733, "reward": 1.85546875, "reward_std": 0.3539557084441185, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.94140625, "step": 503 }, { "clip_ratio": 0.0, "completion_length": 644.5, "epoch": 0.2016, "grad_norm": 0.281223893553984, "kl": 0.129150390625, "learning_rate": 1.937767774086646e-05, "loss": 0.0534, "reward": 2.01953125, "reward_std": 0.21018315106630325, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 504 }, { "clip_ratio": 0.0, "completion_length": 538.984375, "epoch": 0.202, "grad_norm": 0.4731345418434688, "kl": 0.1336669921875, "learning_rate": 1.937281989491892e-05, "loss": 0.042, "reward": 2.1640625, "reward_std": 0.25348418205976486, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 505 }, { "clip_ratio": 0.0, "completion_length": 528.296875, "epoch": 0.2024, "grad_norm": 0.25605788343623254, "kl": 0.1326904296875, "learning_rate": 1.936794377617938e-05, "loss": 0.0176, "reward": 2.0390625, "reward_std": 0.17035653442144394, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 506 }, { "clip_ratio": 0.0, "completion_length": 624.828125, "epoch": 0.2028, "grad_norm": 0.24977991907861932, "kl": 0.12890625, "learning_rate": 1.9363049394154095e-05, "loss": 0.0277, "reward": 2.091796875, "reward_std": 0.0894516110420227, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 507 }, { "clip_ratio": 0.0, "completion_length": 593.0390625, "epoch": 0.2032, "grad_norm": 1.2962987003587108, "kl": 0.1552734375, "learning_rate": 1.935813675838491e-05, "loss": 0.0179, "reward": 2.224609375, "reward_std": 0.26188716292381287, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.982421875, "step": 508 }, { "clip_ratio": 0.0, "completion_length": 628.40625, "epoch": 0.2036, "grad_norm": 0.1767384872695702, "kl": 0.1611328125, "learning_rate": 1.935320587844926e-05, "loss": 0.009, "reward": 2.240234375, "reward_std": 0.15854563564062119, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 509 }, { "clip_ratio": 0.0, "completion_length": 562.0703125, "epoch": 0.204, "grad_norm": 0.5685287534278881, "kl": 0.192138671875, "learning_rate": 1.9348256763960146e-05, "loss": 0.0393, "reward": 1.96484375, "reward_std": 0.16846735030412674, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 510 }, { "clip_ratio": 0.0, "completion_length": 527.3984375, "epoch": 0.2044, "grad_norm": 1.7920512638268666, "kl": 0.418212890625, "learning_rate": 1.9343289424566122e-05, "loss": 0.0405, "reward": 2.224609375, "reward_std": 0.24780654907226562, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.974609375, "step": 511 }, { "clip_ratio": 0.0, "completion_length": 576.2109375, "epoch": 0.2048, "grad_norm": 0.5797830093069938, "kl": 0.172119140625, "learning_rate": 1.933830386995127e-05, "loss": 0.0094, "reward": 2.3359375, "reward_std": 0.08715169876813889, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 1.0, "step": 512 }, { "clip_ratio": 0.0, "completion_length": 536.46875, "epoch": 0.2052, "grad_norm": 0.14134059774347324, "kl": 0.158935546875, "learning_rate": 1.9333300109835182e-05, "loss": 0.0086, "reward": 2.0234375, "reward_std": 0.050389111042022705, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 513 }, { "clip_ratio": 0.0, "completion_length": 626.4921875, "epoch": 0.2056, "grad_norm": 0.5171898591655767, "kl": 0.167236328125, "learning_rate": 1.9328278153972947e-05, "loss": 0.02, "reward": 1.982421875, "reward_std": 0.17979396134614944, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 514 }, { "clip_ratio": 0.0, "completion_length": 665.4609375, "epoch": 0.206, "grad_norm": 4.208206982595979, "kl": 0.253173828125, "learning_rate": 1.9323238012155125e-05, "loss": 0.0291, "reward": 2.04296875, "reward_std": 0.17341843992471695, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 515 }, { "clip_ratio": 0.0, "completion_length": 646.4609375, "epoch": 0.2064, "grad_norm": 0.2975097660211389, "kl": 0.15673828125, "learning_rate": 1.9318179694207726e-05, "loss": 0.0307, "reward": 2.001953125, "reward_std": 0.09738312661647797, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 516 }, { "clip_ratio": 0.0, "completion_length": 700.21875, "epoch": 0.2068, "grad_norm": 0.17966279368730875, "kl": 0.154052734375, "learning_rate": 1.9313103209992205e-05, "loss": 0.006, "reward": 1.994140625, "reward_std": 0.0859375, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 517 }, { "clip_ratio": 0.0, "completion_length": 633.796875, "epoch": 0.2072, "grad_norm": 0.22313909983659644, "kl": 0.16552734375, "learning_rate": 1.9308008569405424e-05, "loss": 0.0063, "reward": 2.015625, "reward_std": 0.0625, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 518 }, { "clip_ratio": 0.0, "completion_length": 782.640625, "epoch": 0.2076, "grad_norm": 0.30933708923909103, "kl": 0.160888671875, "learning_rate": 1.9302895782379648e-05, "loss": 0.0073, "reward": 2.1875, "reward_std": 0.15358919650316238, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 519 }, { "clip_ratio": 0.0, "completion_length": 715.1796875, "epoch": 0.208, "grad_norm": 0.28416820739717, "kl": 0.150390625, "learning_rate": 1.9297764858882516e-05, "loss": 0.0045, "reward": 2.029296875, "reward_std": 0.15682122111320496, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 520 }, { "clip_ratio": 0.0, "completion_length": 750.84375, "epoch": 0.2084, "grad_norm": 0.20734722668804179, "kl": 0.14599609375, "learning_rate": 1.9292615808917027e-05, "loss": 0.0167, "reward": 2.099609375, "reward_std": 0.21747081726789474, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.982421875, "step": 521 }, { "clip_ratio": 0.0, "completion_length": 783.9140625, "epoch": 0.2088, "grad_norm": 1.2168595932034443, "kl": 0.219970703125, "learning_rate": 1.9287448642521513e-05, "loss": 0.0646, "reward": 2.076171875, "reward_std": 0.36007464677095413, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.951171875, "step": 522 }, { "clip_ratio": 0.0, "completion_length": 801.1171875, "epoch": 0.2092, "grad_norm": 0.4365887558614677, "kl": 0.185546875, "learning_rate": 1.9282263369769633e-05, "loss": 0.0261, "reward": 2.044921875, "reward_std": 0.18434104323387146, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.974609375, "step": 523 }, { "clip_ratio": 0.0, "completion_length": 718.8515625, "epoch": 0.2096, "grad_norm": 0.5781531793918995, "kl": 0.17578125, "learning_rate": 1.9277060000770342e-05, "loss": 0.0559, "reward": 1.916015625, "reward_std": 0.23382875323295593, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.970703125, "step": 524 }, { "clip_ratio": 0.0, "completion_length": 769.0859375, "epoch": 0.21, "grad_norm": 139.3137954693048, "kl": 12.645751953125, "learning_rate": 1.9271838545667876e-05, "loss": 0.6655, "reward": 1.978515625, "reward_std": 0.4530119374394417, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.955078125, "step": 525 }, { "clip_ratio": 0.0, "completion_length": 766.5234375, "epoch": 0.2104, "grad_norm": 0.3586272792488043, "kl": 0.186767578125, "learning_rate": 1.9266599014641724e-05, "loss": 0.0557, "reward": 1.875, "reward_std": 0.31757519394159317, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.953125, "step": 526 }, { "clip_ratio": 0.0, "completion_length": 683.46875, "epoch": 0.2108, "grad_norm": 0.25710241269113476, "kl": 0.1650390625, "learning_rate": 1.9261341417906622e-05, "loss": 0.0171, "reward": 2.072265625, "reward_std": 0.20316219329833984, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 527 }, { "clip_ratio": 0.0, "completion_length": 562.328125, "epoch": 0.2112, "grad_norm": 0.3580538936712411, "kl": 0.173583984375, "learning_rate": 1.9256065765712524e-05, "loss": 0.0285, "reward": 2.109375, "reward_std": 0.2218507118523121, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 528 }, { "clip_ratio": 0.0, "completion_length": 637.953125, "epoch": 0.2116, "grad_norm": 0.34339145091184475, "kl": 0.164306640625, "learning_rate": 1.925077206834458e-05, "loss": 0.0433, "reward": 1.962890625, "reward_std": 0.2419046312570572, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.978515625, "step": 529 }, { "clip_ratio": 0.0, "completion_length": 640.3828125, "epoch": 0.212, "grad_norm": 0.4821085042751328, "kl": 0.19482421875, "learning_rate": 1.9245460336123136e-05, "loss": 0.1117, "reward": 2.095703125, "reward_std": 0.4145188182592392, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.939453125, "step": 530 }, { "clip_ratio": 0.0, "completion_length": 672.8046875, "epoch": 0.2124, "grad_norm": 0.8733472142106996, "kl": 0.24951171875, "learning_rate": 1.924013057940367e-05, "loss": 0.0693, "reward": 1.89453125, "reward_std": 0.2728857584297657, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.94921875, "step": 531 }, { "clip_ratio": 0.0, "completion_length": 585.4453125, "epoch": 0.2128, "grad_norm": 0.8752106577284893, "kl": 0.2041015625, "learning_rate": 1.9234782808576823e-05, "loss": 0.058, "reward": 1.98828125, "reward_std": 0.38380974903702736, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.95703125, "step": 532 }, { "clip_ratio": 0.0, "completion_length": 626.640625, "epoch": 0.2132, "grad_norm": 0.4673826495814401, "kl": 0.26171875, "learning_rate": 1.9229417034068352e-05, "loss": 0.1303, "reward": 1.9140625, "reward_std": 0.4001839905977249, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.9375, "step": 533 }, { "clip_ratio": 0.0, "completion_length": 612.6953125, "epoch": 0.2136, "grad_norm": 6.948164874297646, "kl": 0.61474609375, "learning_rate": 1.9224033266339103e-05, "loss": 0.1186, "reward": 1.9453125, "reward_std": 0.362044520676136, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.9609375, "step": 534 }, { "clip_ratio": 0.0, "completion_length": 635.2421875, "epoch": 0.214, "grad_norm": 1.930107511753565, "kl": 0.271240234375, "learning_rate": 1.9218631515885007e-05, "loss": 0.1082, "reward": 1.919921875, "reward_std": 0.31253719329833984, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.966796875, "step": 535 }, { "clip_ratio": 0.0, "completion_length": 695.828125, "epoch": 0.2144, "grad_norm": 11.515197401413367, "kl": 2.890625, "learning_rate": 1.9213211793237056e-05, "loss": 0.2184, "reward": 1.703125, "reward_std": 0.31843262910842896, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.8046875, "rewards/tag_count_reward": 0.875, "step": 536 }, { "clip_ratio": 0.0, "completion_length": 612.0859375, "epoch": 0.2148, "grad_norm": 1.3638702243115526, "kl": 0.406494140625, "learning_rate": 1.9207774108961273e-05, "loss": 0.1925, "reward": 2.00390625, "reward_std": 0.46235188841819763, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94921875, "step": 537 }, { "clip_ratio": 0.0, "completion_length": 616.0625, "epoch": 0.2152, "grad_norm": 0.7980906732687102, "kl": 0.25146484375, "learning_rate": 1.9202318473658707e-05, "loss": 0.1349, "reward": 2.029296875, "reward_std": 0.34652257710695267, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.958984375, "step": 538 }, { "clip_ratio": 0.0, "completion_length": 650.421875, "epoch": 0.2156, "grad_norm": 0.8306011014595367, "kl": 0.1837158203125, "learning_rate": 1.9196844897965393e-05, "loss": 0.0734, "reward": 1.931640625, "reward_std": 0.25959374010562897, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.900390625, "step": 539 }, { "clip_ratio": 0.0, "completion_length": 638.0078125, "epoch": 0.216, "grad_norm": 0.6370126951315744, "kl": 0.178955078125, "learning_rate": 1.9191353392552346e-05, "loss": 0.0606, "reward": 1.9296875, "reward_std": 0.29746808111667633, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.96875, "step": 540 }, { "clip_ratio": 0.0, "completion_length": 710.4296875, "epoch": 0.2164, "grad_norm": 0.48308262432082166, "kl": 0.141845703125, "learning_rate": 1.9185843968125543e-05, "loss": 0.0412, "reward": 1.962890625, "reward_std": 0.1484375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 541 }, { "clip_ratio": 0.0, "completion_length": 633.046875, "epoch": 0.2168, "grad_norm": 0.9578320784798334, "kl": 0.24462890625, "learning_rate": 1.9180316635425883e-05, "loss": 0.1517, "reward": 1.91796875, "reward_std": 0.4501924589276314, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94921875, "step": 542 }, { "clip_ratio": 0.0, "completion_length": 709.1328125, "epoch": 0.2172, "grad_norm": 3.0920877363512216, "kl": 0.537353515625, "learning_rate": 1.9174771405229187e-05, "loss": 0.1344, "reward": 2.138671875, "reward_std": 0.5179375112056732, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.943359375, "step": 543 }, { "clip_ratio": 0.0, "completion_length": 673.671875, "epoch": 0.2176, "grad_norm": 1.5597722447766769, "kl": 0.326171875, "learning_rate": 1.9169208288346168e-05, "loss": 0.1737, "reward": 1.84765625, "reward_std": 0.4477302134037018, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.94140625, "step": 544 }, { "clip_ratio": 0.0, "completion_length": 704.921875, "epoch": 0.218, "grad_norm": 0.6343355305863279, "kl": 0.20751953125, "learning_rate": 1.9163627295622397e-05, "loss": 0.0901, "reward": 1.912109375, "reward_std": 0.3657132238149643, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.966796875, "step": 545 }, { "clip_ratio": 0.0, "completion_length": 699.0625, "epoch": 0.2184, "grad_norm": 1.2874634215947283, "kl": 0.2294921875, "learning_rate": 1.9158028437938316e-05, "loss": 0.0486, "reward": 1.943359375, "reward_std": 0.3243613988161087, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.951171875, "step": 546 }, { "clip_ratio": 0.0, "completion_length": 669.21875, "epoch": 0.2188, "grad_norm": 0.9466755984692281, "kl": 0.230712890625, "learning_rate": 1.9152411726209176e-05, "loss": 0.1389, "reward": 1.93359375, "reward_std": 0.40226559340953827, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.93359375, "step": 547 }, { "clip_ratio": 0.0, "completion_length": 753.4296875, "epoch": 0.2192, "grad_norm": 0.6677747613671717, "kl": 0.2470703125, "learning_rate": 1.914677717138505e-05, "loss": 0.0601, "reward": 1.7890625, "reward_std": 0.4123437851667404, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.9140625, "step": 548 }, { "clip_ratio": 0.0, "completion_length": 700.6171875, "epoch": 0.2196, "grad_norm": 0.3448323091513235, "kl": 0.155029296875, "learning_rate": 1.914112478445079e-05, "loss": 0.0674, "reward": 1.94921875, "reward_std": 0.3697236105799675, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.96484375, "step": 549 }, { "clip_ratio": 0.0, "completion_length": 761.1171875, "epoch": 0.22, "grad_norm": 0.4601683344005156, "kl": 0.218994140625, "learning_rate": 1.913545457642601e-05, "loss": 0.1402, "reward": 1.73828125, "reward_std": 0.4889906644821167, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.89453125, "step": 550 }, { "clip_ratio": 0.0, "completion_length": 769.0546875, "epoch": 0.2204, "grad_norm": 1.9914734516766972, "kl": 0.412109375, "learning_rate": 1.9129766558365076e-05, "loss": 0.1026, "reward": 1.76953125, "reward_std": 0.4602157697081566, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.8359375, "rewards/tag_count_reward": 0.88671875, "step": 551 }, { "clip_ratio": 0.0, "completion_length": 552.609375, "epoch": 0.2208, "grad_norm": 0.5480663140774137, "kl": 0.203369140625, "learning_rate": 1.9124060741357065e-05, "loss": 0.0798, "reward": 2.00390625, "reward_std": 0.2627882584929466, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 552 }, { "clip_ratio": 0.0, "completion_length": 602.765625, "epoch": 0.2212, "grad_norm": 0.6951326809338657, "kl": 0.19873046875, "learning_rate": 1.911833713652576e-05, "loss": 0.0775, "reward": 1.98828125, "reward_std": 0.21658401563763618, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 553 }, { "clip_ratio": 0.0, "completion_length": 599.09375, "epoch": 0.2216, "grad_norm": 0.279415207073284, "kl": 0.202880859375, "learning_rate": 1.9112595755029625e-05, "loss": 0.0721, "reward": 1.958984375, "reward_std": 0.12940485030412674, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 554 }, { "clip_ratio": 0.0, "completion_length": 606.4453125, "epoch": 0.222, "grad_norm": 0.8155198344462113, "kl": 0.326416015625, "learning_rate": 1.910683660806177e-05, "loss": 0.065, "reward": 1.892578125, "reward_std": 0.2846650779247284, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.947265625, "step": 555 }, { "clip_ratio": 0.0, "completion_length": 628.0, "epoch": 0.2224, "grad_norm": 0.6250410829826218, "kl": 0.324462890625, "learning_rate": 1.9101059706849957e-05, "loss": 0.06, "reward": 2.134765625, "reward_std": 0.25771110504865646, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.923828125, "step": 556 }, { "clip_ratio": 0.0, "completion_length": 689.546875, "epoch": 0.2228, "grad_norm": 0.3796175075559453, "kl": 0.241455078125, "learning_rate": 1.9095265062656546e-05, "loss": 0.0674, "reward": 1.92578125, "reward_std": 0.3526701405644417, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.94140625, "step": 557 }, { "clip_ratio": 0.0, "completion_length": 672.4140625, "epoch": 0.2232, "grad_norm": 0.6408723462785393, "kl": 0.246826171875, "learning_rate": 1.908945268677849e-05, "loss": 0.0595, "reward": 1.916015625, "reward_std": 0.24040910601615906, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 558 }, { "clip_ratio": 0.0, "completion_length": 624.1484375, "epoch": 0.2236, "grad_norm": 0.27559491865511104, "kl": 0.15576171875, "learning_rate": 1.9083622590547313e-05, "loss": -0.0016, "reward": 2.185546875, "reward_std": 0.10310593992471695, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 559 }, { "clip_ratio": 0.0, "completion_length": 639.4921875, "epoch": 0.224, "grad_norm": 0.4748735342265291, "kl": 0.178466796875, "learning_rate": 1.907777478532909e-05, "loss": 0.0721, "reward": 2.083984375, "reward_std": 0.37781914323568344, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.958984375, "step": 560 }, { "clip_ratio": 0.0, "completion_length": 643.5234375, "epoch": 0.2244, "grad_norm": 0.9928235937989398, "kl": 0.353271484375, "learning_rate": 1.907190928252441e-05, "loss": 0.2088, "reward": 1.986328125, "reward_std": 0.4974079951643944, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.931640625, "step": 561 }, { "clip_ratio": 0.0, "completion_length": 703.0859375, "epoch": 0.2248, "grad_norm": 1.025357003635867, "kl": 0.38623046875, "learning_rate": 1.906602609356838e-05, "loss": 0.0984, "reward": 2.0390625, "reward_std": 0.4633580818772316, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9375, "step": 562 }, { "clip_ratio": 0.0, "completion_length": 664.1171875, "epoch": 0.2252, "grad_norm": 1.234024961644629, "kl": 0.55859375, "learning_rate": 1.9060125229930572e-05, "loss": 0.2105, "reward": 1.689453125, "reward_std": 0.5530258789658546, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8203125, "rewards/tag_count_reward": 0.869140625, "step": 563 }, { "clip_ratio": 0.0, "completion_length": 689.84375, "epoch": 0.2256, "grad_norm": 1.1128736227697884, "kl": 0.53759765625, "learning_rate": 1.905420670311502e-05, "loss": 0.1438, "reward": 1.83203125, "reward_std": 0.6068924739956856, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.78125, "rewards/tag_count_reward": 0.83984375, "step": 564 }, { "clip_ratio": 0.0, "completion_length": 619.8046875, "epoch": 0.226, "grad_norm": 1.1452086930848044, "kl": 0.3623046875, "learning_rate": 1.9048270524660197e-05, "loss": 0.1661, "reward": 1.76171875, "reward_std": 0.4194094240665436, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.90234375, "step": 565 }, { "clip_ratio": 0.0, "completion_length": 683.6484375, "epoch": 0.2264, "grad_norm": 1.1051625069985225, "kl": 0.34130859375, "learning_rate": 1.9042316706138987e-05, "loss": 0.1978, "reward": 1.76953125, "reward_std": 0.5088462755084038, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.796875, "rewards/tag_count_reward": 0.84765625, "step": 566 }, { "clip_ratio": 0.0, "completion_length": 629.125, "epoch": 0.2268, "grad_norm": 1.3347295670292165, "kl": 0.36865234375, "learning_rate": 1.9036345259158667e-05, "loss": 0.2299, "reward": 1.9375, "reward_std": 0.5584783107042313, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.9140625, "step": 567 }, { "clip_ratio": 0.0, "completion_length": 626.2109375, "epoch": 0.2272, "grad_norm": 1.0617912177557416, "kl": 0.24169921875, "learning_rate": 1.9030356195360875e-05, "loss": 0.2422, "reward": 1.982421875, "reward_std": 0.4430655613541603, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.935546875, "step": 568 }, { "clip_ratio": 0.0, "completion_length": 632.6796875, "epoch": 0.2276, "grad_norm": 0.4338461318272375, "kl": 0.20263671875, "learning_rate": 1.9024349526421596e-05, "loss": 0.1495, "reward": 1.98046875, "reward_std": 0.41335587203502655, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.91796875, "step": 569 }, { "clip_ratio": 0.0, "completion_length": 650.46875, "epoch": 0.228, "grad_norm": 0.2726848745361465, "kl": 0.200439453125, "learning_rate": 1.901832526405114e-05, "loss": 0.0692, "reward": 2.078125, "reward_std": 0.37853705883026123, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 570 }, { "clip_ratio": 0.0, "completion_length": 681.609375, "epoch": 0.2284, "grad_norm": 2.8488695771013774, "kl": 0.322265625, "learning_rate": 1.9012283419994115e-05, "loss": 0.174, "reward": 1.8515625, "reward_std": 0.5745564997196198, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.90625, "step": 571 }, { "clip_ratio": 0.0, "completion_length": 644.4296875, "epoch": 0.2288, "grad_norm": 0.696184894248672, "kl": 0.191162109375, "learning_rate": 1.9006224006029404e-05, "loss": 0.1174, "reward": 1.921875, "reward_std": 0.39987488090991974, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.9453125, "step": 572 }, { "clip_ratio": 0.0, "completion_length": 700.1484375, "epoch": 0.2292, "grad_norm": 0.2653971605490904, "kl": 0.190673828125, "learning_rate": 1.9000147033970148e-05, "loss": 0.0709, "reward": 1.91015625, "reward_std": 0.2848687246441841, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.96484375, "step": 573 }, { "clip_ratio": 0.0, "completion_length": 702.8515625, "epoch": 0.2296, "grad_norm": 0.27949360053343614, "kl": 0.180908203125, "learning_rate": 1.899405251566371e-05, "loss": 0.0447, "reward": 2.08203125, "reward_std": 0.2618129998445511, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.98046875, "step": 574 }, { "clip_ratio": 0.0, "completion_length": 680.4609375, "epoch": 0.23, "grad_norm": 0.22982752707580859, "kl": 0.1585693359375, "learning_rate": 1.8987940462991673e-05, "loss": 0.0175, "reward": 2.033203125, "reward_std": 0.15328482538461685, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 575 }, { "clip_ratio": 0.0, "completion_length": 604.9609375, "epoch": 0.2304, "grad_norm": 0.2153225248863232, "kl": 0.164794921875, "learning_rate": 1.8981810887869784e-05, "loss": 0.048, "reward": 1.98046875, "reward_std": 0.10596735030412674, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 576 }, { "clip_ratio": 0.0, "completion_length": 742.859375, "epoch": 0.2308, "grad_norm": 0.2745356783724472, "kl": 0.1767578125, "learning_rate": 1.8975663802247978e-05, "loss": 0.0421, "reward": 1.96875, "reward_std": 0.1811506412923336, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 577 }, { "clip_ratio": 0.0, "completion_length": 721.6953125, "epoch": 0.2312, "grad_norm": 0.3348660835157033, "kl": 0.1767578125, "learning_rate": 1.8969499218110302e-05, "loss": 0.0193, "reward": 2.015625, "reward_std": 0.24648857861757278, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 578 }, { "clip_ratio": 0.0, "completion_length": 674.1953125, "epoch": 0.2316, "grad_norm": 0.40212791749280014, "kl": 0.177978515625, "learning_rate": 1.896331714747493e-05, "loss": 0.0923, "reward": 2.134765625, "reward_std": 0.3576286733150482, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.970703125, "step": 579 }, { "clip_ratio": 0.0, "completion_length": 709.78125, "epoch": 0.232, "grad_norm": 0.3275948574450762, "kl": 0.169921875, "learning_rate": 1.895711760239413e-05, "loss": 0.0727, "reward": 1.935546875, "reward_std": 0.22810593992471695, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 580 }, { "clip_ratio": 0.0, "completion_length": 748.4375, "epoch": 0.2324, "grad_norm": 38.98974108375022, "kl": 1.424560546875, "learning_rate": 1.8950900594954226e-05, "loss": 0.0976, "reward": 1.935546875, "reward_std": 0.15323300659656525, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 581 }, { "clip_ratio": 0.0, "completion_length": 749.4296875, "epoch": 0.2328, "grad_norm": 0.6120853167051886, "kl": 0.195556640625, "learning_rate": 1.89446661372756e-05, "loss": 0.111, "reward": 1.8125, "reward_std": 0.49640706926584244, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.921875, "step": 582 }, { "clip_ratio": 0.0, "completion_length": 695.5546875, "epoch": 0.2332, "grad_norm": 0.3970167657690221, "kl": 0.1728515625, "learning_rate": 1.893841424151264e-05, "loss": 0.0626, "reward": 1.880859375, "reward_std": 0.5820420831441879, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.896484375, "step": 583 }, { "clip_ratio": 0.0, "completion_length": 701.59375, "epoch": 0.2336, "grad_norm": 0.3959308845412244, "kl": 0.187255859375, "learning_rate": 1.893214491985374e-05, "loss": 0.0601, "reward": 1.48046875, "reward_std": 0.7062309980392456, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.6875, "rewards/tag_count_reward": 0.78515625, "step": 584 }, { "clip_ratio": 0.0, "completion_length": 598.203125, "epoch": 0.234, "grad_norm": 0.29611817620533176, "kl": 0.19287109375, "learning_rate": 1.892585818452126e-05, "loss": 0.1023, "reward": 1.962890625, "reward_std": 0.338131844997406, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 585 }, { "clip_ratio": 0.0, "completion_length": 657.9765625, "epoch": 0.2344, "grad_norm": 0.39031151977058887, "kl": 0.216796875, "learning_rate": 1.8919554047771508e-05, "loss": 0.0355, "reward": 2.072265625, "reward_std": 0.3005325198173523, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 586 }, { "clip_ratio": 0.0, "completion_length": 545.4453125, "epoch": 0.2348, "grad_norm": 0.278858010176991, "kl": 0.21337890625, "learning_rate": 1.8913232521894734e-05, "loss": 0.0832, "reward": 1.986328125, "reward_std": 0.21924476325511932, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.986328125, "step": 587 }, { "clip_ratio": 0.0, "completion_length": 544.3671875, "epoch": 0.2352, "grad_norm": 0.6769410392957795, "kl": 0.281005859375, "learning_rate": 1.890689361921507e-05, "loss": 0.129, "reward": 1.87890625, "reward_std": 0.3433684855699539, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.94921875, "step": 588 }, { "clip_ratio": 0.0, "completion_length": 638.03125, "epoch": 0.2356, "grad_norm": 0.6270483702312183, "kl": 0.263671875, "learning_rate": 1.8900537352090523e-05, "loss": 0.1168, "reward": 2.275390625, "reward_std": 0.5123427882790565, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.923828125, "step": 589 }, { "clip_ratio": 0.0, "completion_length": 704.9140625, "epoch": 0.236, "grad_norm": 7.939013218505528, "kl": 0.8505859375, "learning_rate": 1.889416373291298e-05, "loss": 0.133, "reward": 1.8984375, "reward_std": 0.20830653980374336, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.90625, "step": 590 }, { "clip_ratio": 0.0, "completion_length": 868.7265625, "epoch": 0.2364, "grad_norm": 11.706701409795397, "kl": 0.8115234375, "learning_rate": 1.8887772774108116e-05, "loss": 0.0972, "reward": 1.8203125, "reward_std": 0.32899145781993866, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.703125, "step": 591 }, { "clip_ratio": 0.0, "completion_length": 981.8984375, "epoch": 0.2368, "grad_norm": 0.46382111758311917, "kl": 0.39306640625, "learning_rate": 1.8881364488135448e-05, "loss": 0.0214, "reward": 1.59765625, "reward_std": 0.33214208483695984, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.59765625, "step": 592 }, { "clip_ratio": 0.0, "completion_length": 885.6796875, "epoch": 0.2372, "grad_norm": 0.3849566139521708, "kl": 0.291015625, "learning_rate": 1.887493888748825e-05, "loss": 0.0476, "reward": 1.673828125, "reward_std": 0.34563909471035004, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.689453125, "step": 593 }, { "clip_ratio": 0.0, "completion_length": 771.3671875, "epoch": 0.2376, "grad_norm": 0.44290940134444967, "kl": 0.25244140625, "learning_rate": 1.886849598469356e-05, "loss": 0.0676, "reward": 1.833984375, "reward_std": 0.3980320170521736, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.771484375, "step": 594 }, { "clip_ratio": 0.0, "completion_length": 750.9921875, "epoch": 0.238, "grad_norm": 0.418413911839283, "kl": 0.236328125, "learning_rate": 1.8862035792312148e-05, "loss": 0.0554, "reward": 1.93359375, "reward_std": 0.37462379038333893, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.84765625, "step": 595 }, { "clip_ratio": 0.0, "completion_length": 714.3359375, "epoch": 0.2384, "grad_norm": 0.4727168810611607, "kl": 0.2236328125, "learning_rate": 1.8855558322938492e-05, "loss": 0.0252, "reward": 1.87109375, "reward_std": 0.32723578438162804, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.93359375, "step": 596 }, { "clip_ratio": 0.0, "completion_length": 739.8515625, "epoch": 0.2388, "grad_norm": 0.3917428540035429, "kl": 0.217041015625, "learning_rate": 1.8849063589200744e-05, "loss": 0.0452, "reward": 1.88671875, "reward_std": 0.37136025726795197, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.94921875, "step": 597 }, { "clip_ratio": 0.0, "completion_length": 763.4609375, "epoch": 0.2392, "grad_norm": 0.2796499677491046, "kl": 0.224365234375, "learning_rate": 1.8842551603760725e-05, "loss": 0.0369, "reward": 2.0625, "reward_std": 0.2321586236357689, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.9765625, "step": 598 }, { "clip_ratio": 0.0, "completion_length": 717.8515625, "epoch": 0.2396, "grad_norm": 0.3696485413462533, "kl": 0.212158203125, "learning_rate": 1.8836022379313884e-05, "loss": 0.0503, "reward": 1.986328125, "reward_std": 0.4969033971428871, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.955078125, "step": 599 }, { "clip_ratio": 0.0, "completion_length": 682.5, "epoch": 0.24, "grad_norm": 0.4353222123593179, "kl": 0.22021484375, "learning_rate": 1.8829475928589272e-05, "loss": 0.0189, "reward": 1.822265625, "reward_std": 0.4316793829202652, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.923828125, "step": 600 }, { "clip_ratio": 0.0, "completion_length": 675.2421875, "epoch": 0.2404, "grad_norm": 0.45575426167949507, "kl": 0.235595703125, "learning_rate": 1.8822912264349535e-05, "loss": 0.0347, "reward": 1.9296875, "reward_std": 0.6106016635894775, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.8515625, "rewards/tag_count_reward": 0.90625, "step": 601 }, { "clip_ratio": 0.0, "completion_length": 717.375, "epoch": 0.2408, "grad_norm": 28.91830577235967, "kl": 2.69677734375, "learning_rate": 1.881633139939087e-05, "loss": 0.2147, "reward": 1.8359375, "reward_std": 0.47168339788913727, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.9296875, "step": 602 }, { "clip_ratio": 0.0, "completion_length": 598.125, "epoch": 0.2412, "grad_norm": 0.36427750370712253, "kl": 0.234619140625, "learning_rate": 1.8809733346543013e-05, "loss": 0.0861, "reward": 1.9140625, "reward_std": 0.2883383557200432, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.9609375, "step": 603 }, { "clip_ratio": 0.0, "completion_length": 585.6171875, "epoch": 0.2416, "grad_norm": 1.1321930793809694, "kl": 0.301025390625, "learning_rate": 1.8803118118669203e-05, "loss": 0.1037, "reward": 2.111328125, "reward_std": 0.3671388328075409, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.970703125, "step": 604 }, { "clip_ratio": 0.0, "completion_length": 593.1875, "epoch": 0.242, "grad_norm": 0.24422801861604476, "kl": 0.22607421875, "learning_rate": 1.879648572866617e-05, "loss": 0.0452, "reward": 1.9765625, "reward_std": 0.09375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 605 }, { "clip_ratio": 0.0, "completion_length": 569.4140625, "epoch": 0.2424, "grad_norm": 0.3047653096793033, "kl": 0.2421875, "learning_rate": 1.878983618946409e-05, "loss": 0.0369, "reward": 2.10546875, "reward_std": 0.10596735030412674, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 606 }, { "clip_ratio": 0.0, "completion_length": 533.2890625, "epoch": 0.2428, "grad_norm": 0.32822999714458834, "kl": 0.21240234375, "learning_rate": 1.878316951402658e-05, "loss": 0.0191, "reward": 2.068359375, "reward_std": 0.19265169650316238, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 607 }, { "clip_ratio": 0.0, "completion_length": 564.2109375, "epoch": 0.2432, "grad_norm": 9.523473152744748, "kl": 0.2294921875, "learning_rate": 1.8776485715350672e-05, "loss": 0.0597, "reward": 1.9921875, "reward_std": 0.2378891110420227, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 608 }, { "clip_ratio": 0.0, "completion_length": 561.3828125, "epoch": 0.2436, "grad_norm": 1375.9322265005967, "kl": 0.84716796875, "learning_rate": 1.8769784806466768e-05, "loss": 0.0856, "reward": 2.15625, "reward_std": 0.19628482311964035, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 609 }, { "clip_ratio": 0.0, "completion_length": 522.34375, "epoch": 0.244, "grad_norm": 227.59476337386752, "kl": 0.35400390625, "learning_rate": 1.8763066800438638e-05, "loss": 0.1288, "reward": 2.1171875, "reward_std": 0.24659235030412674, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 610 }, { "clip_ratio": 0.0, "completion_length": 585.265625, "epoch": 0.2444, "grad_norm": 0.168850706325144, "kl": 0.24072265625, "learning_rate": 1.8756331710363375e-05, "loss": 0.0128, "reward": 2.0078125, "reward_std": 0.03125, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 611 }, { "clip_ratio": 0.0, "completion_length": 479.203125, "epoch": 0.2448, "grad_norm": 0.2842800777993855, "kl": 0.257568359375, "learning_rate": 1.874957954937138e-05, "loss": 0.0042, "reward": 2.048828125, "reward_std": 0.11923722177743912, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 612 }, { "clip_ratio": 0.0, "completion_length": 551.40625, "epoch": 0.2452, "grad_norm": 0.31307842766573507, "kl": 0.23486328125, "learning_rate": 1.8742810330626338e-05, "loss": 0.0158, "reward": 2.111328125, "reward_std": 0.18666068464517593, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.986328125, "step": 613 }, { "clip_ratio": 0.0, "completion_length": 555.2890625, "epoch": 0.2456, "grad_norm": 0.8921176887963728, "kl": 0.2333984375, "learning_rate": 1.8736024067325188e-05, "loss": 0.0307, "reward": 1.986328125, "reward_std": 0.0546875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 614 }, { "clip_ratio": 0.0, "completion_length": 576.8359375, "epoch": 0.246, "grad_norm": 1.4261959693290138, "kl": 0.22119140625, "learning_rate": 1.8729220772698096e-05, "loss": 0.0434, "reward": 1.994140625, "reward_std": 0.0859375, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 615 }, { "clip_ratio": 0.0, "completion_length": 647.8125, "epoch": 0.2464, "grad_norm": 8.156864269342595, "kl": 0.20654296875, "learning_rate": 1.8722400460008437e-05, "loss": 0.0224, "reward": 2.2265625, "reward_std": 0.09375, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 616 }, { "clip_ratio": 0.0, "completion_length": 563.5703125, "epoch": 0.2468, "grad_norm": 36.31338975231153, "kl": 0.223876953125, "learning_rate": 1.8715563142552758e-05, "loss": 0.0541, "reward": 2.056640625, "reward_std": 0.16142656654119492, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 617 }, { "clip_ratio": 0.0, "completion_length": 547.5703125, "epoch": 0.2472, "grad_norm": 375.72267797142007, "kl": 0.398681640625, "learning_rate": 1.8708708833660755e-05, "loss": 0.0234, "reward": 2.0859375, "reward_std": 0.19234732538461685, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 618 }, { "clip_ratio": 0.0, "completion_length": 579.4140625, "epoch": 0.2476, "grad_norm": 0.3337836971444256, "kl": 0.220703125, "learning_rate": 1.870183754669526e-05, "loss": 0.0086, "reward": 2.0078125, "reward_std": 0.15799926593899727, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 619 }, { "clip_ratio": 0.0, "completion_length": 606.4375, "epoch": 0.248, "grad_norm": 147.41121245574354, "kl": 0.33447265625, "learning_rate": 1.869494929505219e-05, "loss": 0.0649, "reward": 1.95703125, "reward_std": 0.19971734285354614, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 620 }, { "clip_ratio": 0.0, "completion_length": 650.625, "epoch": 0.2484, "grad_norm": 0.2509444372236024, "kl": 0.217041015625, "learning_rate": 1.8688044092160554e-05, "loss": 0.011, "reward": 2.1171875, "reward_std": 0.1060761883854866, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 621 }, { "clip_ratio": 0.0, "completion_length": 678.59375, "epoch": 0.2488, "grad_norm": 0.6813984778125906, "kl": 0.18212890625, "learning_rate": 1.8681121951482397e-05, "loss": 0.0661, "reward": 2.05078125, "reward_std": 0.22481617331504822, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 622 }, { "clip_ratio": 0.0, "completion_length": 625.21875, "epoch": 0.2492, "grad_norm": 0.27371741460231785, "kl": 0.2216796875, "learning_rate": 1.8674182886512776e-05, "loss": 0.0315, "reward": 1.82421875, "reward_std": 0.17205528914928436, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.94921875, "step": 623 }, { "clip_ratio": 0.0, "completion_length": 695.84375, "epoch": 0.2496, "grad_norm": 0.6144407965592209, "kl": 0.21484375, "learning_rate": 1.8667226910779767e-05, "loss": 0.0647, "reward": 1.96875, "reward_std": 0.29269562661647797, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 624 }, { "clip_ratio": 0.0, "completion_length": 641.265625, "epoch": 0.25, "grad_norm": 0.15942958814271652, "kl": 0.19580078125, "learning_rate": 1.866025403784439e-05, "loss": 0.0093, "reward": 2.0078125, "reward_std": 0.03125, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 625 }, { "clip_ratio": 0.0, "completion_length": 663.203125, "epoch": 0.2504, "grad_norm": 0.21715271114062726, "kl": 0.199462890625, "learning_rate": 1.8653264281300622e-05, "loss": 0.0473, "reward": 1.98046875, "reward_std": 0.140625, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 626 }, { "clip_ratio": 0.0, "completion_length": 669.6875, "epoch": 0.2508, "grad_norm": 0.24489724030245788, "kl": 0.18115234375, "learning_rate": 1.864625765477535e-05, "loss": 0.0632, "reward": 2.095703125, "reward_std": 0.2842430993914604, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.970703125, "step": 627 }, { "clip_ratio": 0.0, "completion_length": 654.7734375, "epoch": 0.2512, "grad_norm": 0.2800150883755788, "kl": 0.17822265625, "learning_rate": 1.8639234171928355e-05, "loss": 0.0557, "reward": 2.171875, "reward_std": 0.15779343992471695, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 628 }, { "clip_ratio": 0.0, "completion_length": 652.1875, "epoch": 0.2516, "grad_norm": 0.16036293239703614, "kl": 0.178955078125, "learning_rate": 1.863219384645227e-05, "loss": 0.0137, "reward": 1.9765625, "reward_std": 0.09375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 629 }, { "clip_ratio": 0.0, "completion_length": 666.8046875, "epoch": 0.252, "grad_norm": 0.3003028698899826, "kl": 0.175537109375, "learning_rate": 1.8625136692072577e-05, "loss": 0.059, "reward": 1.998046875, "reward_std": 0.21604233980178833, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 630 }, { "clip_ratio": 0.0, "completion_length": 628.3671875, "epoch": 0.2524, "grad_norm": 189.75474803171088, "kl": 38.884765625, "learning_rate": 1.861806272254755e-05, "loss": 1.8263, "reward": 1.9921875, "reward_std": 0.20536844432353973, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 631 }, { "clip_ratio": 0.0, "completion_length": 674.265625, "epoch": 0.2528, "grad_norm": 0.21978979887348749, "kl": 0.1875, "learning_rate": 1.8610971951668265e-05, "loss": 0.0364, "reward": 1.970703125, "reward_std": 0.1796875, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 632 }, { "clip_ratio": 0.0, "completion_length": 688.4453125, "epoch": 0.2532, "grad_norm": 0.20979584449581604, "kl": 0.183837890625, "learning_rate": 1.8603864393258534e-05, "loss": 0.0768, "reward": 2.041015625, "reward_std": 0.2522806338965893, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.962890625, "step": 633 }, { "clip_ratio": 0.0, "completion_length": 643.3828125, "epoch": 0.2536, "grad_norm": 3.500646341220676, "kl": 0.302490234375, "learning_rate": 1.8596740061174912e-05, "loss": 0.0309, "reward": 2.052734375, "reward_std": 0.2672552466392517, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 634 }, { "clip_ratio": 0.0, "completion_length": 634.453125, "epoch": 0.254, "grad_norm": 0.29118544658045403, "kl": 0.190673828125, "learning_rate": 1.8589598969306646e-05, "loss": 0.0391, "reward": 2.146484375, "reward_std": 0.34007685631513596, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 635 }, { "clip_ratio": 0.0, "completion_length": 707.828125, "epoch": 0.2544, "grad_norm": 0.175772846837893, "kl": 0.1748046875, "learning_rate": 1.8582441131575658e-05, "loss": 0.0318, "reward": 2.00390625, "reward_std": 0.16527669876813889, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 636 }, { "clip_ratio": 0.0, "completion_length": 680.765625, "epoch": 0.2548, "grad_norm": 0.2568337309856518, "kl": 0.172119140625, "learning_rate": 1.8575266561936526e-05, "loss": 0.042, "reward": 2.005859375, "reward_std": 0.19265169650316238, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 637 }, { "clip_ratio": 0.0, "completion_length": 597.4140625, "epoch": 0.2552, "grad_norm": 0.33432806037586515, "kl": 0.158203125, "learning_rate": 1.856807527437643e-05, "loss": 0.0757, "reward": 2.37890625, "reward_std": 0.36026807129383087, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 638 }, { "clip_ratio": 0.0, "completion_length": 649.2734375, "epoch": 0.2556, "grad_norm": 0.28711110867090933, "kl": 0.172607421875, "learning_rate": 1.8560867282915164e-05, "loss": 0.0073, "reward": 2.00390625, "reward_std": 0.1717662587761879, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98828125, "step": 639 }, { "clip_ratio": 0.0, "completion_length": 760.921875, "epoch": 0.256, "grad_norm": 0.6837255983314494, "kl": 0.16748046875, "learning_rate": 1.855364260160507e-05, "loss": 0.0422, "reward": 1.88671875, "reward_std": 0.34489792585372925, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.95703125, "step": 640 }, { "clip_ratio": 0.0, "completion_length": 751.8984375, "epoch": 0.2564, "grad_norm": 0.6421803453227095, "kl": 0.18017578125, "learning_rate": 1.854640124453103e-05, "loss": 0.0651, "reward": 2.056640625, "reward_std": 0.3746442273259163, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.962890625, "step": 641 }, { "clip_ratio": 0.0, "completion_length": 658.65625, "epoch": 0.2568, "grad_norm": 0.2600625837185863, "kl": 0.149169921875, "learning_rate": 1.8539143225810453e-05, "loss": 0.0602, "reward": 1.947265625, "reward_std": 0.21545885503292084, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 642 }, { "clip_ratio": 0.0, "completion_length": 701.625, "epoch": 0.2572, "grad_norm": 0.24815292650410908, "kl": 0.16845703125, "learning_rate": 1.8531868559593205e-05, "loss": 0.0269, "reward": 1.982421875, "reward_std": 0.17055703699588776, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 643 }, { "clip_ratio": 0.0, "completion_length": 714.6953125, "epoch": 0.2576, "grad_norm": 0.5714382785021838, "kl": 0.189697265625, "learning_rate": 1.8524577260061628e-05, "loss": 0.028, "reward": 2.142578125, "reward_std": 0.24130193144083023, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 644 }, { "clip_ratio": 0.0, "completion_length": 599.171875, "epoch": 0.258, "grad_norm": 0.30177673633180513, "kl": 0.15771484375, "learning_rate": 1.851726934143048e-05, "loss": 0.0327, "reward": 2.48828125, "reward_std": 0.29586296528577805, "rewards/accuracy_reward": 0.625, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94140625, "step": 645 }, { "clip_ratio": 0.0, "completion_length": 707.8203125, "epoch": 0.2584, "grad_norm": 0.20025788088417495, "kl": 0.149658203125, "learning_rate": 1.850994481794692e-05, "loss": 0.0569, "reward": 1.984375, "reward_std": 0.2579732611775398, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 646 }, { "clip_ratio": 0.0, "completion_length": 663.484375, "epoch": 0.2588, "grad_norm": 0.3548444440169146, "kl": 0.1806640625, "learning_rate": 1.8502603703890488e-05, "loss": 0.0512, "reward": 2.0546875, "reward_std": 0.37220466136932373, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.96875, "step": 647 }, { "clip_ratio": 0.0, "completion_length": 636.5625, "epoch": 0.2592, "grad_norm": 0.24378879845455698, "kl": 0.162109375, "learning_rate": 1.8495246013573057e-05, "loss": 0.0562, "reward": 2.158203125, "reward_std": 0.2712289020419121, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 648 }, { "clip_ratio": 0.0, "completion_length": 713.453125, "epoch": 0.2596, "grad_norm": 0.2134588976555159, "kl": 0.16357421875, "learning_rate": 1.848787176133882e-05, "loss": 0.0321, "reward": 2.12109375, "reward_std": 0.1597641110420227, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 649 }, { "clip_ratio": 0.0, "completion_length": 718.6484375, "epoch": 0.26, "grad_norm": 0.2580886782062574, "kl": 0.17236328125, "learning_rate": 1.848048096156426e-05, "loss": 0.0832, "reward": 2.10546875, "reward_std": 0.39385079592466354, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 650 }, { "clip_ratio": 0.0, "completion_length": 777.1875, "epoch": 0.2604, "grad_norm": 0.26710287253308984, "kl": 0.137451171875, "learning_rate": 1.8473073628658123e-05, "loss": 0.0383, "reward": 2.021484375, "reward_std": 0.38048408180475235, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.958984375, "step": 651 }, { "clip_ratio": 0.0, "completion_length": 766.09375, "epoch": 0.2608, "grad_norm": 0.25647122641778736, "kl": 0.164306640625, "learning_rate": 1.8465649777061377e-05, "loss": 0.0684, "reward": 1.876953125, "reward_std": 0.35355690121650696, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.947265625, "step": 652 }, { "clip_ratio": 0.0, "completion_length": 742.4140625, "epoch": 0.2612, "grad_norm": 0.24016222696581527, "kl": 0.1353759765625, "learning_rate": 1.8458209421247208e-05, "loss": 0.051, "reward": 1.84765625, "reward_std": 0.37856457382440567, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.93359375, "step": 653 }, { "clip_ratio": 0.0, "completion_length": 726.296875, "epoch": 0.2616, "grad_norm": 0.2826009192797722, "kl": 0.149658203125, "learning_rate": 1.8450752575720967e-05, "loss": 0.1066, "reward": 1.861328125, "reward_std": 0.4930327981710434, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.931640625, "step": 654 }, { "clip_ratio": 0.0, "completion_length": 640.3515625, "epoch": 0.262, "grad_norm": 0.2708586762374592, "kl": 0.1611328125, "learning_rate": 1.8443279255020153e-05, "loss": 0.062, "reward": 1.9375, "reward_std": 0.3347940221428871, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.9609375, "step": 655 }, { "clip_ratio": 0.0, "completion_length": 688.6015625, "epoch": 0.2624, "grad_norm": 0.24595654886203877, "kl": 0.170166015625, "learning_rate": 1.843578947371439e-05, "loss": 0.0853, "reward": 1.8515625, "reward_std": 0.3765430226922035, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.9375, "step": 656 }, { "clip_ratio": 0.0, "completion_length": 642.8515625, "epoch": 0.2628, "grad_norm": 0.681355673511502, "kl": 0.208740234375, "learning_rate": 1.842828324640539e-05, "loss": 0.1256, "reward": 1.904296875, "reward_std": 0.47841688990592957, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.935546875, "step": 657 }, { "clip_ratio": 0.0, "completion_length": 758.234375, "epoch": 0.2632, "grad_norm": 0.30377352082921943, "kl": 0.166015625, "learning_rate": 1.8420760587726925e-05, "loss": 0.0747, "reward": 1.93359375, "reward_std": 0.46467770636081696, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.90234375, "step": 658 }, { "clip_ratio": 0.0, "completion_length": 686.9375, "epoch": 0.2636, "grad_norm": 0.4782591072658645, "kl": 0.17822265625, "learning_rate": 1.8413221512344805e-05, "loss": 0.1312, "reward": 1.794921875, "reward_std": 0.4563656523823738, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.912109375, "step": 659 }, { "clip_ratio": 0.0, "completion_length": 732.03125, "epoch": 0.264, "grad_norm": 0.29941659545073007, "kl": 0.1494140625, "learning_rate": 1.8405666034956842e-05, "loss": 0.1803, "reward": 1.828125, "reward_std": 0.6068588122725487, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.828125, "rewards/tag_count_reward": 0.875, "step": 660 }, { "clip_ratio": 0.0, "completion_length": 651.3984375, "epoch": 0.2644, "grad_norm": 0.24021580168526527, "kl": 0.1513671875, "learning_rate": 1.839809417029283e-05, "loss": 0.114, "reward": 2.126953125, "reward_std": 0.33660782128572464, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.947265625, "step": 661 }, { "clip_ratio": 0.0, "completion_length": 669.8046875, "epoch": 0.2648, "grad_norm": 0.2826725647579659, "kl": 0.162841796875, "learning_rate": 1.8390505933114503e-05, "loss": 0.0825, "reward": 1.912109375, "reward_std": 0.3852238655090332, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.943359375, "step": 662 }, { "clip_ratio": 0.0, "completion_length": 767.734375, "epoch": 0.2652, "grad_norm": 0.42549674110903757, "kl": 0.15576171875, "learning_rate": 1.8382901338215515e-05, "loss": 0.1754, "reward": 1.400390625, "reward_std": 0.6323322206735611, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.6484375, "rewards/tag_count_reward": 0.744140625, "step": 663 }, { "clip_ratio": 0.0, "completion_length": 746.1875, "epoch": 0.2656, "grad_norm": 0.35922249090046326, "kl": 0.149169921875, "learning_rate": 1.837528040042142e-05, "loss": 0.1189, "reward": 1.634765625, "reward_std": 0.46663394942879677, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.75, "rewards/tag_count_reward": 0.814453125, "step": 664 }, { "clip_ratio": 0.0, "completion_length": 675.6328125, "epoch": 0.266, "grad_norm": 0.4936940552366552, "kl": 0.1673583984375, "learning_rate": 1.836764313458962e-05, "loss": 0.1804, "reward": 1.876953125, "reward_std": 0.6029605120420456, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.900390625, "step": 665 }, { "clip_ratio": 0.0, "completion_length": 767.1796875, "epoch": 0.2664, "grad_norm": 0.3131558831045285, "kl": 0.155029296875, "learning_rate": 1.8359989555609355e-05, "loss": 0.1215, "reward": 1.685546875, "reward_std": 0.5456719622015953, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8203125, "rewards/tag_count_reward": 0.865234375, "step": 666 }, { "clip_ratio": 0.0, "completion_length": 650.296875, "epoch": 0.2668, "grad_norm": 0.2590797793762031, "kl": 0.144775390625, "learning_rate": 1.8352319678401677e-05, "loss": 0.0799, "reward": 1.974609375, "reward_std": 0.31496891379356384, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 667 }, { "clip_ratio": 0.0, "completion_length": 698.3671875, "epoch": 0.2672, "grad_norm": 0.31843475259317133, "kl": 0.1640625, "learning_rate": 1.834463351791939e-05, "loss": 0.0776, "reward": 1.986328125, "reward_std": 0.31287000328302383, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.939453125, "step": 668 }, { "clip_ratio": 0.0, "completion_length": 694.9140625, "epoch": 0.2676, "grad_norm": 0.5072297285924585, "kl": 0.177978515625, "learning_rate": 1.8336931089147076e-05, "loss": 0.1694, "reward": 1.9375, "reward_std": 0.6569493114948273, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.90625, "step": 669 }, { "clip_ratio": 0.0, "completion_length": 676.2890625, "epoch": 0.268, "grad_norm": 1.1391113837864286, "kl": 0.152587890625, "learning_rate": 1.8329212407100996e-05, "loss": 0.1846, "reward": 1.9140625, "reward_std": 0.5625697001814842, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.921875, "step": 670 }, { "clip_ratio": 0.0, "completion_length": 747.4140625, "epoch": 0.2684, "grad_norm": 0.6050591141367241, "kl": 0.16259765625, "learning_rate": 1.8321477486829128e-05, "loss": 0.1685, "reward": 1.451171875, "reward_std": 0.6326377764344215, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.6640625, "rewards/tag_count_reward": 0.748046875, "step": 671 }, { "clip_ratio": 0.0, "completion_length": 718.6796875, "epoch": 0.2688, "grad_norm": 0.7859574540063486, "kl": 0.201904296875, "learning_rate": 1.8313726343411085e-05, "loss": 0.2392, "reward": 1.58984375, "reward_std": 0.7998636662960052, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.703125, "rewards/tag_count_reward": 0.78515625, "step": 672 }, { "clip_ratio": 0.0, "completion_length": 705.2265625, "epoch": 0.2692, "grad_norm": 1.1682396905438428, "kl": 0.19287109375, "learning_rate": 1.830595899195813e-05, "loss": 0.1874, "reward": 1.775390625, "reward_std": 0.655318908393383, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.8046875, "rewards/tag_count_reward": 0.853515625, "step": 673 }, { "clip_ratio": 0.0, "completion_length": 702.9921875, "epoch": 0.2696, "grad_norm": 0.6833319491089238, "kl": 0.146484375, "learning_rate": 1.82981754476131e-05, "loss": 0.2042, "reward": 1.814453125, "reward_std": 0.6573643833398819, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.8046875, "rewards/tag_count_reward": 0.869140625, "step": 674 }, { "clip_ratio": 0.0, "completion_length": 686.5625, "epoch": 0.27, "grad_norm": 8.105416706702881, "kl": 0.7451171875, "learning_rate": 1.8290375725550417e-05, "loss": 0.2388, "reward": 1.634765625, "reward_std": 0.5725494772195816, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.7890625, "rewards/tag_count_reward": 0.845703125, "step": 675 }, { "clip_ratio": 0.0, "completion_length": 603.6953125, "epoch": 0.2704, "grad_norm": 0.48483445345200477, "kl": 0.188232421875, "learning_rate": 1.8282559840976043e-05, "loss": 0.0904, "reward": 2.048828125, "reward_std": 0.3005402684211731, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.947265625, "step": 676 }, { "clip_ratio": 0.0, "completion_length": 632.515625, "epoch": 0.2708, "grad_norm": 0.9710936411545987, "kl": 0.197998046875, "learning_rate": 1.827472780912744e-05, "loss": 0.2473, "reward": 1.822265625, "reward_std": 0.6295557245612144, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.8203125, "rewards/tag_count_reward": 0.869140625, "step": 677 }, { "clip_ratio": 0.0, "completion_length": 501.7109375, "epoch": 0.2712, "grad_norm": 0.7021565922324364, "kl": 0.200927734375, "learning_rate": 1.8266879645273557e-05, "loss": 0.1887, "reward": 2.0234375, "reward_std": 0.43325159698724747, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.953125, "step": 678 }, { "clip_ratio": 0.0, "completion_length": 614.1015625, "epoch": 0.2716, "grad_norm": 0.6330245101046608, "kl": 0.1875, "learning_rate": 1.8259015364714786e-05, "loss": 0.1753, "reward": 1.724609375, "reward_std": 0.4313738942146301, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.8359375, "rewards/tag_count_reward": 0.880859375, "step": 679 }, { "clip_ratio": 0.0, "completion_length": 553.5859375, "epoch": 0.272, "grad_norm": 0.45302627195430745, "kl": 0.21630859375, "learning_rate": 1.8251134982782952e-05, "loss": 0.1004, "reward": 2.04296875, "reward_std": 0.3152615651488304, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 680 }, { "clip_ratio": 0.0, "completion_length": 510.6328125, "epoch": 0.2724, "grad_norm": 0.72490088986225, "kl": 0.214599609375, "learning_rate": 1.824323851484126e-05, "loss": 0.2021, "reward": 1.908203125, "reward_std": 0.5275630205869675, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.916015625, "step": 681 }, { "clip_ratio": 0.0, "completion_length": 527.890625, "epoch": 0.2728, "grad_norm": 1.0104261622921475, "kl": 0.26513671875, "learning_rate": 1.8235325976284276e-05, "loss": 0.3964, "reward": 1.67578125, "reward_std": 0.6086910739541054, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.796875, "rewards/tag_count_reward": 0.84765625, "step": 682 }, { "clip_ratio": 0.0, "completion_length": 501.3125, "epoch": 0.2732, "grad_norm": 0.41436553336494114, "kl": 0.230224609375, "learning_rate": 1.82273973825379e-05, "loss": 0.2316, "reward": 1.912109375, "reward_std": 0.4570281207561493, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.935546875, "step": 683 }, { "clip_ratio": 0.0, "completion_length": 530.9609375, "epoch": 0.2736, "grad_norm": 0.7248574961932972, "kl": 0.26513671875, "learning_rate": 1.8219452749059332e-05, "loss": 0.2388, "reward": 1.724609375, "reward_std": 0.4564047083258629, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.8359375, "rewards/tag_count_reward": 0.880859375, "step": 684 }, { "clip_ratio": 0.0, "completion_length": 539.21875, "epoch": 0.274, "grad_norm": 1.8554691257420022, "kl": 0.26416015625, "learning_rate": 1.821149209133704e-05, "loss": 0.1646, "reward": 1.951171875, "reward_std": 0.356665700674057, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.919921875, "step": 685 }, { "clip_ratio": 0.0, "completion_length": 557.75, "epoch": 0.2744, "grad_norm": 0.8751312931762738, "kl": 0.27392578125, "learning_rate": 1.8203515424890738e-05, "loss": 0.1867, "reward": 1.83984375, "reward_std": 0.37382933497428894, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.93359375, "step": 686 }, { "clip_ratio": 0.0, "completion_length": 615.0703125, "epoch": 0.2748, "grad_norm": 0.9567184680696277, "kl": 0.3154296875, "learning_rate": 1.819552276527134e-05, "loss": 0.223, "reward": 1.892578125, "reward_std": 0.556139849126339, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.8515625, "rewards/tag_count_reward": 0.892578125, "step": 687 }, { "clip_ratio": 0.0, "completion_length": 722.125, "epoch": 0.2752, "grad_norm": 1.418608219540666, "kl": 0.265380859375, "learning_rate": 1.8187514128060946e-05, "loss": 0.2263, "reward": 1.580078125, "reward_std": 0.7174660861492157, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.7578125, "rewards/tag_count_reward": 0.822265625, "step": 688 }, { "clip_ratio": 0.0, "completion_length": 721.125, "epoch": 0.2756, "grad_norm": 0.5384786776168357, "kl": 0.21630859375, "learning_rate": 1.8179489528872808e-05, "loss": 0.2547, "reward": 1.486328125, "reward_std": 0.7130448520183563, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.6953125, "rewards/tag_count_reward": 0.791015625, "step": 689 }, { "clip_ratio": 0.0, "completion_length": 791.3671875, "epoch": 0.276, "grad_norm": 1.5167783746628158, "kl": 0.244140625, "learning_rate": 1.8171448983351284e-05, "loss": 0.2616, "reward": 1.232421875, "reward_std": 0.9044418781995773, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.5078125, "rewards/tag_count_reward": 0.630859375, "step": 690 }, { "clip_ratio": 0.0, "completion_length": 763.8515625, "epoch": 0.2764, "grad_norm": 0.7086316485442661, "kl": 0.218505859375, "learning_rate": 1.816339250717184e-05, "loss": 0.2559, "reward": 1.525390625, "reward_std": 0.9021541923284531, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.6015625, "rewards/tag_count_reward": 0.705078125, "step": 691 }, { "clip_ratio": 0.0, "completion_length": 795.1953125, "epoch": 0.2768, "grad_norm": 1.5516729543641783, "kl": 0.200927734375, "learning_rate": 1.8155320116040983e-05, "loss": 0.2268, "reward": 1.369140625, "reward_std": 0.7473985850811005, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.5234375, "rewards/tag_count_reward": 0.666015625, "step": 692 }, { "clip_ratio": 0.0, "completion_length": 668.1484375, "epoch": 0.2772, "grad_norm": 1.2405239364482854, "kl": 0.2646484375, "learning_rate": 1.814723182569625e-05, "loss": 0.298, "reward": 1.578125, "reward_std": 0.7537873908877373, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.734375, "rewards/tag_count_reward": 0.796875, "step": 693 }, { "clip_ratio": 0.0, "completion_length": 693.9453125, "epoch": 0.2776, "grad_norm": 269.9639912522376, "kl": 34.1630859375, "learning_rate": 1.8139127651906183e-05, "loss": 2.8816, "reward": 1.611328125, "reward_std": 0.7647852450609207, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.7265625, "rewards/tag_count_reward": 0.798828125, "step": 694 }, { "clip_ratio": 0.0, "completion_length": 630.5, "epoch": 0.278, "grad_norm": 0.6506590142730213, "kl": 0.245849609375, "learning_rate": 1.8131007610470278e-05, "loss": 0.2424, "reward": 1.689453125, "reward_std": 0.5354024097323418, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.7421875, "rewards/tag_count_reward": 0.822265625, "step": 695 }, { "clip_ratio": 0.0, "completion_length": 557.4375, "epoch": 0.2784, "grad_norm": 0.7478823415728656, "kl": 0.236328125, "learning_rate": 1.812287171721897e-05, "loss": 0.1104, "reward": 1.95703125, "reward_std": 0.2670421749353409, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 696 }, { "clip_ratio": 0.0, "completion_length": 635.1484375, "epoch": 0.2788, "grad_norm": 0.4712745327993434, "kl": 0.232177734375, "learning_rate": 1.8114719988013612e-05, "loss": 0.0676, "reward": 1.943359375, "reward_std": 0.3098445385694504, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.958984375, "step": 697 }, { "clip_ratio": 0.0, "completion_length": 658.515625, "epoch": 0.2792, "grad_norm": 0.5310576938664473, "kl": 0.246826171875, "learning_rate": 1.81065524387464e-05, "loss": 0.1056, "reward": 1.908203125, "reward_std": 0.2632145509123802, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 698 }, { "clip_ratio": 0.0, "completion_length": 614.6484375, "epoch": 0.2796, "grad_norm": 4.324021516328731, "kl": 0.359619140625, "learning_rate": 1.80983690853404e-05, "loss": 0.1438, "reward": 1.908203125, "reward_std": 0.2756398841738701, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.962890625, "step": 699 }, { "clip_ratio": 0.0, "completion_length": 630.5625, "epoch": 0.28, "grad_norm": 0.4497603357204461, "kl": 0.252685546875, "learning_rate": 1.8090169943749477e-05, "loss": 0.0691, "reward": 1.94921875, "reward_std": 0.2767498642206192, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 700 }, { "clip_ratio": 0.0, "completion_length": 541.078125, "epoch": 0.2804, "grad_norm": 0.4395993713856212, "kl": 0.23583984375, "learning_rate": 1.8081955029958272e-05, "loss": 0.0997, "reward": 2.0625, "reward_std": 0.362333245575428, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 701 }, { "clip_ratio": 0.0, "completion_length": 560.9921875, "epoch": 0.2808, "grad_norm": 2.3407772588754496, "kl": 0.3681640625, "learning_rate": 1.8073724359982184e-05, "loss": 0.1761, "reward": 1.890625, "reward_std": 0.3269607946276665, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.953125, "step": 702 }, { "clip_ratio": 0.0, "completion_length": 576.828125, "epoch": 0.2812, "grad_norm": 0.33120631757974933, "kl": 0.2451171875, "learning_rate": 1.8065477949867327e-05, "loss": 0.046, "reward": 1.99609375, "reward_std": 0.1486629769206047, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 703 }, { "clip_ratio": 0.0, "completion_length": 715.140625, "epoch": 0.2816, "grad_norm": 0.36371710134914337, "kl": 0.235595703125, "learning_rate": 1.8057215815690494e-05, "loss": 0.0884, "reward": 1.85546875, "reward_std": 0.4342746362090111, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.93359375, "step": 704 }, { "clip_ratio": 0.0, "completion_length": 569.34375, "epoch": 0.282, "grad_norm": 0.7498551038757554, "kl": 0.278564453125, "learning_rate": 1.804893797355914e-05, "loss": 0.0749, "reward": 1.935546875, "reward_std": 0.40565526485443115, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.958984375, "step": 705 }, { "clip_ratio": 0.0, "completion_length": 630.078125, "epoch": 0.2824, "grad_norm": 0.34823113078003826, "kl": 0.2470703125, "learning_rate": 1.8040644439611348e-05, "loss": 0.1337, "reward": 2.08984375, "reward_std": 0.30096687376499176, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.96484375, "step": 706 }, { "clip_ratio": 0.0, "completion_length": 550.4921875, "epoch": 0.2828, "grad_norm": 0.40590191485700733, "kl": 0.246826171875, "learning_rate": 1.803233523001578e-05, "loss": 0.1784, "reward": 1.974609375, "reward_std": 0.454647958278656, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.951171875, "step": 707 }, { "clip_ratio": 0.0, "completion_length": 599.7890625, "epoch": 0.2832, "grad_norm": 0.3126444542481391, "kl": 0.23779296875, "learning_rate": 1.802401036097167e-05, "loss": 0.1861, "reward": 1.8984375, "reward_std": 0.3429766967892647, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.953125, "step": 708 }, { "clip_ratio": 0.0, "completion_length": 785.4296875, "epoch": 0.2836, "grad_norm": 0.35480359441983705, "kl": 0.22705078125, "learning_rate": 1.8015669848708768e-05, "loss": 0.0764, "reward": 1.927734375, "reward_std": 0.4845649302005768, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.7890625, "rewards/tag_count_reward": 0.880859375, "step": 709 }, { "clip_ratio": 0.0, "completion_length": 675.234375, "epoch": 0.284, "grad_norm": 0.9054827429725738, "kl": 0.31640625, "learning_rate": 1.8007313709487334e-05, "loss": 0.1207, "reward": 1.826171875, "reward_std": 0.42606811225414276, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.927734375, "step": 710 }, { "clip_ratio": 0.0, "completion_length": 651.0625, "epoch": 0.2844, "grad_norm": 0.37493208430044334, "kl": 0.4052734375, "learning_rate": 1.7998941959598097e-05, "loss": 0.1762, "reward": 1.83203125, "reward_std": 0.46526191383600235, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.92578125, "step": 711 }, { "clip_ratio": 0.0, "completion_length": 662.6875, "epoch": 0.2848, "grad_norm": 0.443718417951369, "kl": 0.263671875, "learning_rate": 1.79905546153622e-05, "loss": 0.1612, "reward": 1.783203125, "reward_std": 0.5741490721702576, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.8359375, "rewards/tag_count_reward": 0.900390625, "step": 712 }, { "clip_ratio": 0.0, "completion_length": 611.3828125, "epoch": 0.2852, "grad_norm": 0.3942557800836719, "kl": 0.24951171875, "learning_rate": 1.7982151693131206e-05, "loss": 0.1575, "reward": 2.033203125, "reward_std": 0.4637361988425255, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.916015625, "step": 713 }, { "clip_ratio": 0.0, "completion_length": 730.6875, "epoch": 0.2856, "grad_norm": 0.577495798679289, "kl": 0.28955078125, "learning_rate": 1.7973733209287036e-05, "loss": 0.154, "reward": 1.6953125, "reward_std": 0.6082654520869255, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.8125, "rewards/tag_count_reward": 0.8671875, "step": 714 }, { "clip_ratio": 0.0, "completion_length": 637.6015625, "epoch": 0.286, "grad_norm": 0.857988469728571, "kl": 0.2880859375, "learning_rate": 1.7965299180241963e-05, "loss": 0.2128, "reward": 1.923828125, "reward_std": 0.6766286045312881, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.8203125, "rewards/tag_count_reward": 0.884765625, "step": 715 }, { "clip_ratio": 0.0, "completion_length": 723.8046875, "epoch": 0.2864, "grad_norm": 0.4979356223456465, "kl": 0.26416015625, "learning_rate": 1.7956849622438554e-05, "loss": 0.1372, "reward": 1.609375, "reward_std": 0.8079803884029388, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.734375, "rewards/tag_count_reward": 0.8203125, "step": 716 }, { "clip_ratio": 0.0, "completion_length": 695.9140625, "epoch": 0.2868, "grad_norm": 0.40863294209915, "kl": 0.262451171875, "learning_rate": 1.794838455234966e-05, "loss": 0.0888, "reward": 1.02734375, "reward_std": 0.9159408062696457, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.390625, "rewards/tag_count_reward": 0.55078125, "step": 717 }, { "clip_ratio": 0.0, "completion_length": 757.5078125, "epoch": 0.2872, "grad_norm": 3.464176475012491, "kl": 0.94384765625, "learning_rate": 1.7939903986478354e-05, "loss": 0.1684, "reward": 1.005859375, "reward_std": 0.9008960127830505, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.3671875, "rewards/tag_count_reward": 0.498046875, "step": 718 }, { "clip_ratio": 0.0, "completion_length": 755.9296875, "epoch": 0.2876, "grad_norm": 0.5050294178605481, "kl": 0.24560546875, "learning_rate": 1.793140794135795e-05, "loss": 0.1239, "reward": 0.701171875, "reward_std": 0.9287013709545135, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.296875, "rewards/tag_count_reward": 0.357421875, "step": 719 }, { "clip_ratio": 0.0, "completion_length": 739.015625, "epoch": 0.288, "grad_norm": 1.159687293620524, "kl": 0.5947265625, "learning_rate": 1.792289643355191e-05, "loss": 0.1783, "reward": 0.919921875, "reward_std": 0.9904097318649292, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.40625, "rewards/tag_count_reward": 0.458984375, "step": 720 }, { "clip_ratio": 0.0, "completion_length": 740.6015625, "epoch": 0.2884, "grad_norm": 0.4533738239296195, "kl": 0.266357421875, "learning_rate": 1.7914369479653858e-05, "loss": 0.1797, "reward": 1.2265625, "reward_std": 0.973580077290535, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.5, "rewards/tag_count_reward": 0.5546875, "step": 721 }, { "clip_ratio": 0.0, "completion_length": 729.890625, "epoch": 0.2888, "grad_norm": 1.6347744049714417, "kl": 0.303466796875, "learning_rate": 1.7905827096287532e-05, "loss": 0.2071, "reward": 1.462890625, "reward_std": 0.8141616731882095, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.6875, "rewards/tag_count_reward": 0.759765625, "step": 722 }, { "clip_ratio": 0.0, "completion_length": 725.21875, "epoch": 0.2892, "grad_norm": 1.677378277654967, "kl": 0.2705078125, "learning_rate": 1.789726930010674e-05, "loss": 0.2258, "reward": 1.541015625, "reward_std": 0.7651144564151764, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.734375, "rewards/tag_count_reward": 0.798828125, "step": 723 }, { "clip_ratio": 0.0, "completion_length": 747.578125, "epoch": 0.2896, "grad_norm": 1.1172004109284706, "kl": 0.34423828125, "learning_rate": 1.7888696107795343e-05, "loss": 0.2187, "reward": 1.380859375, "reward_std": 0.7512632980942726, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.625, "rewards/tag_count_reward": 0.740234375, "step": 724 }, { "clip_ratio": 0.0, "completion_length": 642.9375, "epoch": 0.29, "grad_norm": 1.4318867255147625, "kl": 0.3232421875, "learning_rate": 1.788010753606722e-05, "loss": 0.2351, "reward": 1.841796875, "reward_std": 0.6002832502126694, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.912109375, "step": 725 }, { "clip_ratio": 0.0, "completion_length": 602.625, "epoch": 0.2904, "grad_norm": 0.3053123723321889, "kl": 0.259765625, "learning_rate": 1.7871503601666233e-05, "loss": 0.2135, "reward": 1.849609375, "reward_std": 0.5549320876598358, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.904296875, "step": 726 }, { "clip_ratio": 0.0, "completion_length": 634.546875, "epoch": 0.2908, "grad_norm": 0.3785102037743658, "kl": 0.250732421875, "learning_rate": 1.786288432136619e-05, "loss": 0.158, "reward": 1.86328125, "reward_std": 0.36292988806962967, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.94921875, "step": 727 }, { "clip_ratio": 0.0, "completion_length": 549.1796875, "epoch": 0.2912, "grad_norm": 0.36227449202984946, "kl": 0.26025390625, "learning_rate": 1.785424971197082e-05, "loss": 0.1904, "reward": 1.9375, "reward_std": 0.4520261734724045, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.953125, "step": 728 }, { "clip_ratio": 0.0, "completion_length": 504.484375, "epoch": 0.2916, "grad_norm": 5.905150495502928, "kl": 0.32373046875, "learning_rate": 1.7845599790313735e-05, "loss": 0.2186, "reward": 1.91015625, "reward_std": 0.3874828889966011, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94921875, "step": 729 }, { "clip_ratio": 0.0, "completion_length": 561.96875, "epoch": 0.292, "grad_norm": 0.4576128742675573, "kl": 0.25634765625, "learning_rate": 1.78369345732584e-05, "loss": 0.1772, "reward": 1.884765625, "reward_std": 0.35635800659656525, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.955078125, "step": 730 }, { "clip_ratio": 0.0, "completion_length": 464.8359375, "epoch": 0.2924, "grad_norm": 0.5590020843553009, "kl": 0.29345703125, "learning_rate": 1.78282540776981e-05, "loss": 0.1821, "reward": 1.896484375, "reward_std": 0.3307974189519882, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.951171875, "step": 731 }, { "clip_ratio": 0.0, "completion_length": 486.234375, "epoch": 0.2928, "grad_norm": 0.59045957279201, "kl": 0.29345703125, "learning_rate": 1.7819558320555902e-05, "loss": 0.1749, "reward": 1.9453125, "reward_std": 0.3449516147375107, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.921875, "step": 732 }, { "clip_ratio": 0.0, "completion_length": 477.8125, "epoch": 0.2932, "grad_norm": 0.5344362547373311, "kl": 0.2724609375, "learning_rate": 1.7810847318784632e-05, "loss": 0.1467, "reward": 2.076171875, "reward_std": 0.270138680934906, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 733 }, { "clip_ratio": 0.0, "completion_length": 428.0078125, "epoch": 0.2936, "grad_norm": 191.61982725037902, "kl": 0.330078125, "learning_rate": 1.780212108936684e-05, "loss": 0.158, "reward": 2.033203125, "reward_std": 0.33115020394325256, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 734 }, { "clip_ratio": 0.0, "completion_length": 435.6328125, "epoch": 0.294, "grad_norm": 216.63291224916202, "kl": 0.5166015625, "learning_rate": 1.7793379649314743e-05, "loss": 0.1367, "reward": 2.013671875, "reward_std": 0.1934482902288437, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 735 }, { "clip_ratio": 0.0, "completion_length": 424.1796875, "epoch": 0.2944, "grad_norm": 784.9559963734227, "kl": 0.56787109375, "learning_rate": 1.7784623015670237e-05, "loss": 0.1822, "reward": 1.9765625, "reward_std": 0.23999404907226562, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 736 }, { "clip_ratio": 0.0, "completion_length": 460.1015625, "epoch": 0.2948, "grad_norm": 53.120547488599954, "kl": 0.3759765625, "learning_rate": 1.7775851205504823e-05, "loss": 0.11, "reward": 1.951171875, "reward_std": 0.2701386883854866, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 737 }, { "clip_ratio": 0.0, "completion_length": 433.6796875, "epoch": 0.2952, "grad_norm": 6.17967122202342, "kl": 1.0, "learning_rate": 1.7767064235919594e-05, "loss": 0.1189, "reward": 1.97265625, "reward_std": 0.109375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 738 }, { "clip_ratio": 0.0, "completion_length": 462.3359375, "epoch": 0.2956, "grad_norm": 0.3606940723194356, "kl": 0.26953125, "learning_rate": 1.7758262124045195e-05, "loss": -0.0192, "reward": 2.302734375, "reward_std": 0.20015982538461685, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 739 }, { "clip_ratio": 0.0, "completion_length": 410.40625, "epoch": 0.296, "grad_norm": 34.91928685929344, "kl": 0.3203125, "learning_rate": 1.7749444887041797e-05, "loss": 0.1035, "reward": 2.10546875, "reward_std": 0.25553157180547714, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 740 }, { "clip_ratio": 0.0, "completion_length": 415.6015625, "epoch": 0.2964, "grad_norm": 64.69897555076612, "kl": 0.35302734375, "learning_rate": 1.7740612542099054e-05, "loss": 0.0924, "reward": 1.998046875, "reward_std": 0.31203506886959076, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 741 }, { "clip_ratio": 0.0, "completion_length": 459.890625, "epoch": 0.2968, "grad_norm": 3.4942418363903287, "kl": 0.28759765625, "learning_rate": 1.7731765106436073e-05, "loss": 0.049, "reward": 2.029296875, "reward_std": 0.22347432374954224, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.990234375, "step": 742 }, { "clip_ratio": 0.0, "completion_length": 441.9140625, "epoch": 0.2972, "grad_norm": 98.2035857040874, "kl": 0.8076171875, "learning_rate": 1.7722902597301385e-05, "loss": 0.0942, "reward": 2.140625, "reward_std": 0.25409550219774246, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 743 }, { "clip_ratio": 0.0, "completion_length": 502.15625, "epoch": 0.2976, "grad_norm": 16394.134373643625, "kl": 50.2724609375, "learning_rate": 1.7714025031972904e-05, "loss": 2.0608, "reward": 1.986328125, "reward_std": 0.17220931500196457, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 744 }, { "clip_ratio": 0.0, "completion_length": 452.25, "epoch": 0.298, "grad_norm": 83.03395553211175, "kl": 13.8486328125, "learning_rate": 1.7705132427757895e-05, "loss": 1.0251, "reward": 1.943359375, "reward_std": 0.3270547389984131, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.935546875, "step": 745 }, { "clip_ratio": 0.0, "completion_length": 448.5234375, "epoch": 0.2984, "grad_norm": 2.889583781678565, "kl": 0.3076171875, "learning_rate": 1.7696224801992947e-05, "loss": 0.0202, "reward": 2.041015625, "reward_std": 0.18453482538461685, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 746 }, { "clip_ratio": 0.0, "completion_length": 515.484375, "epoch": 0.2988, "grad_norm": 7.832418118017966, "kl": 0.439453125, "learning_rate": 1.7687302172043933e-05, "loss": 0.1711, "reward": 1.912109375, "reward_std": 0.35204415023326874, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.951171875, "step": 747 }, { "clip_ratio": 0.0, "completion_length": 486.046875, "epoch": 0.2992, "grad_norm": 30.63294227524537, "kl": 0.43701171875, "learning_rate": 1.767836455530598e-05, "loss": 0.0562, "reward": 1.98828125, "reward_std": 0.15207062661647797, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 748 }, { "clip_ratio": 0.0, "completion_length": 483.40625, "epoch": 0.2996, "grad_norm": 18.24984905145706, "kl": 0.29638671875, "learning_rate": 1.7669411969203417e-05, "loss": 0.0198, "reward": 2.009765625, "reward_std": 0.12863312661647797, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 749 }, { "clip_ratio": 0.0, "completion_length": 536.3515625, "epoch": 0.3, "grad_norm": 6.118176855175826, "kl": 0.92041015625, "learning_rate": 1.766044443118978e-05, "loss": 0.1283, "reward": 1.8671875, "reward_std": 0.3158567547798157, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9609375, "step": 750 }, { "clip_ratio": 0.0, "completion_length": 494.0859375, "epoch": 0.3004, "grad_norm": 30.615976362910548, "kl": 2.6015625, "learning_rate": 1.7651461958747745e-05, "loss": 0.1565, "reward": 2.171875, "reward_std": 0.3534773215651512, "rewards/accuracy_reward": 0.3046875, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.9453125, "step": 751 }, { "clip_ratio": 0.0, "completion_length": 517.1484375, "epoch": 0.3008, "grad_norm": 2.543791986288801, "kl": 1.32470703125, "learning_rate": 1.764246456938909e-05, "loss": 0.0767, "reward": 2.033203125, "reward_std": 0.12460917234420776, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 752 }, { "clip_ratio": 0.0, "completion_length": 534.5078125, "epoch": 0.3012, "grad_norm": 492.2071819185291, "kl": 5.67578125, "learning_rate": 1.76334522806547e-05, "loss": 0.3301, "reward": 1.884765625, "reward_std": 0.4382587596774101, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.8203125, "rewards/tag_count_reward": 0.845703125, "step": 753 }, { "clip_ratio": 0.0, "completion_length": 592.5078125, "epoch": 0.3016, "grad_norm": 33.87259165080516, "kl": 6.29296875, "learning_rate": 1.762442511011448e-05, "loss": 0.6588, "reward": 1.87890625, "reward_std": 0.521524041891098, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.91796875, "step": 754 }, { "clip_ratio": 0.0, "completion_length": 605.3671875, "epoch": 0.302, "grad_norm": 1162.2283965211152, "kl": 44.80078125, "learning_rate": 1.761538307536737e-05, "loss": 1.9548, "reward": 1.623046875, "reward_std": 0.3893898278474808, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.6953125, "rewards/tag_count_reward": 0.794921875, "step": 755 }, { "clip_ratio": 0.0, "completion_length": 672.84375, "epoch": 0.3024, "grad_norm": 308.7493970846926, "kl": 4.28076171875, "learning_rate": 1.7606326194041274e-05, "loss": 0.362, "reward": 1.552734375, "reward_std": 0.552101343870163, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.6171875, "rewards/tag_count_reward": 0.732421875, "step": 756 }, { "clip_ratio": 0.0, "completion_length": 592.0, "epoch": 0.3028, "grad_norm": 18.661433812747287, "kl": 0.861328125, "learning_rate": 1.759725448379305e-05, "loss": 0.1282, "reward": 1.830078125, "reward_std": 0.47317079454660416, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.908203125, "step": 757 }, { "clip_ratio": 0.0, "completion_length": 487.9765625, "epoch": 0.3032, "grad_norm": 12.723795217199015, "kl": 0.419921875, "learning_rate": 1.7588167962308458e-05, "loss": 0.1554, "reward": 1.921875, "reward_std": 0.1915779709815979, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 758 }, { "clip_ratio": 0.0, "completion_length": 647.4765625, "epoch": 0.3036, "grad_norm": 251.0451641404087, "kl": 4.3291015625, "learning_rate": 1.7579066647302134e-05, "loss": 0.3631, "reward": 1.609375, "reward_std": 0.6433265954256058, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.7265625, "rewards/tag_count_reward": 0.8046875, "step": 759 }, { "clip_ratio": 0.0, "completion_length": 619.03125, "epoch": 0.304, "grad_norm": 42.108970193855775, "kl": 0.69970703125, "learning_rate": 1.7569950556517566e-05, "loss": 0.1065, "reward": 1.99609375, "reward_std": 0.33547328412532806, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 760 }, { "clip_ratio": 0.0, "completion_length": 704.984375, "epoch": 0.3044, "grad_norm": 223.2306187809389, "kl": 1.939453125, "learning_rate": 1.7560819707727034e-05, "loss": 0.2515, "reward": 1.35546875, "reward_std": 0.5808394700288773, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.609375, "rewards/tag_count_reward": 0.73046875, "step": 761 }, { "clip_ratio": 0.0, "completion_length": 603.75, "epoch": 0.3048, "grad_norm": 57.611099811567, "kl": 0.939453125, "learning_rate": 1.7551674118731592e-05, "loss": 0.2425, "reward": 1.74609375, "reward_std": 0.5669402629137039, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.828125, "rewards/tag_count_reward": 0.88671875, "step": 762 }, { "clip_ratio": 0.0, "completion_length": 662.21875, "epoch": 0.3052, "grad_norm": 98.84431361908045, "kl": 1.6328125, "learning_rate": 1.754251380736104e-05, "loss": 0.2922, "reward": 1.525390625, "reward_std": 0.49993864446878433, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.7109375, "rewards/tag_count_reward": 0.814453125, "step": 763 }, { "clip_ratio": 0.0, "completion_length": 675.140625, "epoch": 0.3056, "grad_norm": 48.68342467449964, "kl": 2.7626953125, "learning_rate": 1.7533338791473872e-05, "loss": 0.3159, "reward": 1.685546875, "reward_std": 0.42488177865743637, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.7421875, "rewards/tag_count_reward": 0.818359375, "step": 764 }, { "clip_ratio": 0.0, "completion_length": 724.3359375, "epoch": 0.306, "grad_norm": 45.76142711992145, "kl": 1.33056640625, "learning_rate": 1.7524149088957244e-05, "loss": 0.2649, "reward": 1.638671875, "reward_std": 0.6757695525884628, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.7578125, "rewards/tag_count_reward": 0.833984375, "step": 765 }, { "clip_ratio": 0.0, "completion_length": 707.625, "epoch": 0.3064, "grad_norm": 23.023273427140875, "kl": 2.35546875, "learning_rate": 1.7514944717726962e-05, "loss": 0.3263, "reward": 1.380859375, "reward_std": 0.5646215975284576, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.640625, "rewards/tag_count_reward": 0.716796875, "step": 766 }, { "clip_ratio": 0.0, "completion_length": 776.3828125, "epoch": 0.3068, "grad_norm": 84.5280098198947, "kl": 3.0537109375, "learning_rate": 1.7505725695727414e-05, "loss": 0.33, "reward": 1.447265625, "reward_std": 0.5989357307553291, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.5234375, "rewards/tag_count_reward": 0.658203125, "step": 767 }, { "clip_ratio": 0.0, "completion_length": 749.9609375, "epoch": 0.3072, "grad_norm": 790.8043615221726, "kl": 2.080078125, "learning_rate": 1.749649204093155e-05, "loss": 0.286, "reward": 1.375, "reward_std": 0.5413762852549553, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.5625, "rewards/tag_count_reward": 0.6875, "step": 768 }, { "clip_ratio": 0.0, "completion_length": 669.9609375, "epoch": 0.3076, "grad_norm": 291.5843482239066, "kl": 1.33203125, "learning_rate": 1.7487243771340862e-05, "loss": 0.3498, "reward": 1.421875, "reward_std": 0.6802405416965485, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.640625, "rewards/tag_count_reward": 0.78125, "step": 769 }, { "clip_ratio": 0.0, "completion_length": 778.453125, "epoch": 0.308, "grad_norm": 26.793404362688438, "kl": 3.212890625, "learning_rate": 1.747798090498532e-05, "loss": 0.3383, "reward": 1.138671875, "reward_std": 0.6122183501720428, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.484375, "rewards/tag_count_reward": 0.638671875, "step": 770 }, { "clip_ratio": 0.0, "completion_length": 803.125, "epoch": 0.3084, "grad_norm": 73.92755259288336, "kl": 5.76953125, "learning_rate": 1.746870345992336e-05, "loss": 0.4755, "reward": 1.134765625, "reward_std": 0.6635657697916031, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.4609375, "rewards/tag_count_reward": 0.611328125, "step": 771 }, { "clip_ratio": 0.0, "completion_length": 830.140625, "epoch": 0.3088, "grad_norm": 296.60048173746145, "kl": 10.2265625, "learning_rate": 1.7459411454241822e-05, "loss": 0.6736, "reward": 1.09375, "reward_std": 0.5950115658342838, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.375, "rewards/tag_count_reward": 0.5625, "step": 772 }, { "clip_ratio": 0.0, "completion_length": 798.9296875, "epoch": 0.3092, "grad_norm": 233.63705379201673, "kl": 14.20703125, "learning_rate": 1.7450104906055963e-05, "loss": 0.9083, "reward": 1.232421875, "reward_std": 0.6778158247470856, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.4765625, "rewards/tag_count_reward": 0.630859375, "step": 773 }, { "clip_ratio": 0.0, "completion_length": 832.8046875, "epoch": 0.3096, "grad_norm": 32.19192012506138, "kl": 7.171875, "learning_rate": 1.7440783833509366e-05, "loss": 0.548, "reward": 1.0546875, "reward_std": 0.6214631050825119, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.3828125, "rewards/tag_count_reward": 0.546875, "step": 774 }, { "clip_ratio": 0.0, "completion_length": 747.9453125, "epoch": 0.31, "grad_norm": 91.33918298045508, "kl": 2.146484375, "learning_rate": 1.7431448254773943e-05, "loss": 0.3761, "reward": 1.322265625, "reward_std": 0.8206359893083572, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.5703125, "rewards/tag_count_reward": 0.720703125, "step": 775 }, { "clip_ratio": 0.0, "completion_length": 931.3984375, "epoch": 0.3104, "grad_norm": 207.4163467919359, "kl": 3.859375, "learning_rate": 1.7422098188049885e-05, "loss": 0.259, "reward": 1.072265625, "reward_std": 0.5798581913113594, "rewards/accuracy_reward": 0.2890625, "rewards/format_reward": 0.296875, "rewards/tag_count_reward": 0.486328125, "step": 776 }, { "clip_ratio": 0.0, "completion_length": 829.5234375, "epoch": 0.3108, "grad_norm": 294.83025203630535, "kl": 3.201171875, "learning_rate": 1.741273365156561e-05, "loss": 0.4142, "reward": 0.9765625, "reward_std": 0.6908371821045876, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.3359375, "rewards/tag_count_reward": 0.515625, "step": 777 }, { "clip_ratio": 0.0, "completion_length": 938.5703125, "epoch": 0.3112, "grad_norm": 401.92313836618956, "kl": 5.46484375, "learning_rate": 1.7403354663577782e-05, "loss": 0.3717, "reward": 0.7109375, "reward_std": 0.5296831093728542, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.1484375, "rewards/tag_count_reward": 0.421875, "step": 778 }, { "clip_ratio": 0.0, "completion_length": 909.71875, "epoch": 0.3116, "grad_norm": 64.63595953297956, "kl": 4.173828125, "learning_rate": 1.7393961242371203e-05, "loss": 0.2622, "reward": 0.818359375, "reward_std": 0.34476715698838234, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.21875, "rewards/tag_count_reward": 0.451171875, "step": 779 }, { "clip_ratio": 0.0, "completion_length": 838.171875, "epoch": 0.312, "grad_norm": 114.63295763475779, "kl": 3.5, "learning_rate": 1.7384553406258842e-05, "loss": 0.3441, "reward": 0.79296875, "reward_std": 0.4688112586736679, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.3046875, "rewards/tag_count_reward": 0.48828125, "step": 780 }, { "clip_ratio": 0.0, "completion_length": 833.65625, "epoch": 0.3124, "grad_norm": 26.872884648506606, "kl": 2.32421875, "learning_rate": 1.737513117358174e-05, "loss": 0.3205, "reward": 0.869140625, "reward_std": 0.6069475710391998, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.34375, "rewards/tag_count_reward": 0.525390625, "step": 781 }, { "clip_ratio": 0.0, "completion_length": 865.765625, "epoch": 0.3128, "grad_norm": 174.11524391592786, "kl": 6.451171875, "learning_rate": 1.7365694562709034e-05, "loss": 0.4397, "reward": 0.828125, "reward_std": 0.4794958382844925, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.3125, "rewards/tag_count_reward": 0.515625, "step": 782 }, { "clip_ratio": 0.0, "completion_length": 736.984375, "epoch": 0.3132, "grad_norm": 652.3052244709436, "kl": 2.87109375, "learning_rate": 1.7356243592037876e-05, "loss": 0.437, "reward": 1.203125, "reward_std": 0.6136139035224915, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.53125, "rewards/tag_count_reward": 0.671875, "step": 783 }, { "clip_ratio": 0.0, "completion_length": 814.078125, "epoch": 0.3136, "grad_norm": 18.598693940583846, "kl": 4.58984375, "learning_rate": 1.7346778279993417e-05, "loss": 0.4252, "reward": 0.953125, "reward_std": 0.4894583784043789, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.3828125, "rewards/tag_count_reward": 0.5703125, "step": 784 }, { "clip_ratio": 0.0, "completion_length": 720.6953125, "epoch": 0.314, "grad_norm": 97.53312491998828, "kl": 3.98828125, "learning_rate": 1.7337298645028764e-05, "loss": 0.4688, "reward": 1.4140625, "reward_std": 0.754123330116272, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.6328125, "rewards/tag_count_reward": 0.765625, "step": 785 }, { "clip_ratio": 0.0, "completion_length": 829.4453125, "epoch": 0.3144, "grad_norm": 1821.9190116911855, "kl": 12.921875, "learning_rate": 1.732780470562496e-05, "loss": 0.8336, "reward": 1.091796875, "reward_std": 0.742634192109108, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.3984375, "rewards/tag_count_reward": 0.544921875, "step": 786 }, { "clip_ratio": 0.0, "completion_length": 758.4453125, "epoch": 0.3148, "grad_norm": 313.67669665107593, "kl": 9.71875, "learning_rate": 1.7318296480290912e-05, "loss": 0.7385, "reward": 1.5, "reward_std": 0.7637804299592972, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.625, "rewards/tag_count_reward": 0.75, "step": 787 }, { "clip_ratio": 0.0, "completion_length": 720.4375, "epoch": 0.3152, "grad_norm": 127.7986094710488, "kl": 9.3203125, "learning_rate": 1.7308773987563406e-05, "loss": 0.7815, "reward": 1.37890625, "reward_std": 0.8106274455785751, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.625, "rewards/tag_count_reward": 0.73046875, "step": 788 }, { "clip_ratio": 0.0, "completion_length": 848.0, "epoch": 0.3156, "grad_norm": 92.34866749091691, "kl": 8.6875, "learning_rate": 1.7299237246007018e-05, "loss": 0.4998, "reward": 0.95703125, "reward_std": 0.46751774847507477, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.390625, "rewards/tag_count_reward": 0.55859375, "step": 789 }, { "clip_ratio": 0.0, "completion_length": 877.84375, "epoch": 0.316, "grad_norm": 36.14755702977983, "kl": 3.904296875, "learning_rate": 1.7289686274214116e-05, "loss": 0.4053, "reward": 0.9140625, "reward_std": 0.7105987668037415, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.296875, "rewards/tag_count_reward": 0.4921875, "step": 790 }, { "clip_ratio": 0.0, "completion_length": 802.921875, "epoch": 0.3164, "grad_norm": 139.3450325452508, "kl": 5.375, "learning_rate": 1.7280121090804813e-05, "loss": 0.5323, "reward": 1.087890625, "reward_std": 0.4742478132247925, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.3828125, "rewards/tag_count_reward": 0.572265625, "step": 791 }, { "clip_ratio": 0.0, "completion_length": 717.84375, "epoch": 0.3168, "grad_norm": 21.71452467531757, "kl": 1.814453125, "learning_rate": 1.727054171442692e-05, "loss": 0.37, "reward": 1.310546875, "reward_std": 0.6331155225634575, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.515625, "rewards/tag_count_reward": 0.669921875, "step": 792 }, { "clip_ratio": 0.0, "completion_length": 809.578125, "epoch": 0.3172, "grad_norm": 27.50368868007465, "kl": 4.24609375, "learning_rate": 1.7260948163755918e-05, "loss": 0.4241, "reward": 0.978515625, "reward_std": 0.6940451189875603, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.40625, "rewards/tag_count_reward": 0.564453125, "step": 793 }, { "clip_ratio": 0.0, "completion_length": 756.796875, "epoch": 0.3176, "grad_norm": 70.1698974908529, "kl": 5.58203125, "learning_rate": 1.7251340457494934e-05, "loss": 0.5137, "reward": 1.576171875, "reward_std": 0.7556234002113342, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.671875, "rewards/tag_count_reward": 0.779296875, "step": 794 }, { "clip_ratio": 0.0, "completion_length": 717.28125, "epoch": 0.318, "grad_norm": 2851.1702152209978, "kl": 10.90625, "learning_rate": 1.7241718614374678e-05, "loss": 0.9432, "reward": 1.232421875, "reward_std": 0.8415197134017944, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.53125, "rewards/tag_count_reward": 0.662109375, "step": 795 }, { "clip_ratio": 0.0, "completion_length": 792.7109375, "epoch": 0.3184, "grad_norm": 281.6989708641852, "kl": 12.609375, "learning_rate": 1.7232082653153422e-05, "loss": 0.7896, "reward": 1.173828125, "reward_std": 0.6860866397619247, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.5078125, "rewards/tag_count_reward": 0.650390625, "step": 796 }, { "clip_ratio": 0.0, "completion_length": 894.109375, "epoch": 0.3188, "grad_norm": 46.14378472280866, "kl": 10.375, "learning_rate": 1.722243259261697e-05, "loss": 0.6426, "reward": 0.7734375, "reward_std": 0.5711571741849184, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.265625, "rewards/tag_count_reward": 0.5078125, "step": 797 }, { "clip_ratio": 0.0, "completion_length": 903.09375, "epoch": 0.3192, "grad_norm": 28.968474624024935, "kl": 4.125, "learning_rate": 1.721276845157861e-05, "loss": 0.3758, "reward": 0.677734375, "reward_std": 0.622653029859066, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.2265625, "rewards/tag_count_reward": 0.443359375, "step": 798 }, { "clip_ratio": 0.0, "completion_length": 909.609375, "epoch": 0.3196, "grad_norm": 18.35449983729331, "kl": 2.8203125, "learning_rate": 1.720309024887907e-05, "loss": 0.3184, "reward": 0.666015625, "reward_std": 0.6467582061886787, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.2109375, "rewards/tag_count_reward": 0.439453125, "step": 799 }, { "clip_ratio": 0.0, "completion_length": 858.4765625, "epoch": 0.32, "grad_norm": 9.612060439536078, "kl": 2.59765625, "learning_rate": 1.7193398003386514e-05, "loss": 0.3548, "reward": 0.826171875, "reward_std": 0.5738712027668953, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.25, "rewards/tag_count_reward": 0.482421875, "step": 800 }, { "clip_ratio": 0.0, "completion_length": 845.4453125, "epoch": 0.3204, "grad_norm": 10.807110974617512, "kl": 1.689453125, "learning_rate": 1.7183691733996463e-05, "loss": 0.2885, "reward": 1.0, "reward_std": 0.7198735475540161, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.390625, "rewards/tag_count_reward": 0.5703125, "step": 801 }, { "clip_ratio": 0.0, "completion_length": 823.90625, "epoch": 0.3208, "grad_norm": 22.905782051963314, "kl": 2.5703125, "learning_rate": 1.717397145963179e-05, "loss": 0.3369, "reward": 0.9765625, "reward_std": 0.4983821511268616, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.2734375, "rewards/tag_count_reward": 0.484375, "step": 802 }, { "clip_ratio": 0.0, "completion_length": 825.59375, "epoch": 0.3212, "grad_norm": 5.936233740064358, "kl": 2.93359375, "learning_rate": 1.716423719924266e-05, "loss": 0.3099, "reward": 0.875, "reward_std": 0.5017623901367188, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.3359375, "rewards/tag_count_reward": 0.5390625, "step": 803 }, { "clip_ratio": 0.0, "completion_length": 879.9296875, "epoch": 0.3216, "grad_norm": 7.430684343150555, "kl": 3.02734375, "learning_rate": 1.715448897180652e-05, "loss": 0.2546, "reward": 0.693359375, "reward_std": 0.36967140436172485, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.21875, "rewards/tag_count_reward": 0.427734375, "step": 804 }, { "clip_ratio": 0.0, "completion_length": 850.2109375, "epoch": 0.322, "grad_norm": 6.634528378979898, "kl": 1.40625, "learning_rate": 1.7144726796328034e-05, "loss": 0.3158, "reward": 0.859375, "reward_std": 0.6907422244548798, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.328125, "rewards/tag_count_reward": 0.5234375, "step": 805 }, { "clip_ratio": 0.0, "completion_length": 740.3515625, "epoch": 0.3224, "grad_norm": 5.2533530597688625, "kl": 0.9150390625, "learning_rate": 1.7134950691839063e-05, "loss": 0.3429, "reward": 1.12890625, "reward_std": 0.6787637174129486, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.453125, "rewards/tag_count_reward": 0.63671875, "step": 806 }, { "clip_ratio": 0.0, "completion_length": 905.2421875, "epoch": 0.3228, "grad_norm": 2.86764819649873, "kl": 1.171875, "learning_rate": 1.7125160677398625e-05, "loss": 0.188, "reward": 0.626953125, "reward_std": 0.39397556334733963, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.1875, "rewards/tag_count_reward": 0.431640625, "step": 807 }, { "clip_ratio": 0.0, "completion_length": 924.3671875, "epoch": 0.3232, "grad_norm": 11.498664471008402, "kl": 0.9228515625, "learning_rate": 1.7115356772092858e-05, "loss": 0.1801, "reward": 0.791015625, "reward_std": 0.5883963704109192, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.2109375, "rewards/tag_count_reward": 0.439453125, "step": 808 }, { "clip_ratio": 0.0, "completion_length": 757.1484375, "epoch": 0.3236, "grad_norm": 5.501248884439278, "kl": 0.57763671875, "learning_rate": 1.710553899503496e-05, "loss": 0.2328, "reward": 1.34375, "reward_std": 0.6753109768033028, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.53125, "rewards/tag_count_reward": 0.6796875, "step": 809 }, { "clip_ratio": 0.0, "completion_length": 903.296875, "epoch": 0.324, "grad_norm": 12.161848170246571, "kl": 0.99609375, "learning_rate": 1.709570736536521e-05, "loss": 0.201, "reward": 0.69921875, "reward_std": 0.5486800000071526, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.234375, "rewards/tag_count_reward": 0.46484375, "step": 810 }, { "clip_ratio": 0.0, "completion_length": 646.15625, "epoch": 0.3244, "grad_norm": 736.1301888438037, "kl": 4.283203125, "learning_rate": 1.7085861902250864e-05, "loss": 0.4139, "reward": 1.478515625, "reward_std": 0.666233204305172, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.671875, "rewards/tag_count_reward": 0.767578125, "step": 811 }, { "clip_ratio": 0.0, "completion_length": 738.375, "epoch": 0.3248, "grad_norm": 54.957338785521294, "kl": 0.9423828125, "learning_rate": 1.7076002624886156e-05, "loss": 0.2938, "reward": 1.134765625, "reward_std": 0.5865841582417488, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.484375, "rewards/tag_count_reward": 0.642578125, "step": 812 }, { "clip_ratio": 0.0, "completion_length": 761.25, "epoch": 0.3252, "grad_norm": 18.011806596796443, "kl": 1.80078125, "learning_rate": 1.706612955249225e-05, "loss": 0.3556, "reward": 1.208984375, "reward_std": 0.7678902745246887, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.4453125, "rewards/tag_count_reward": 0.623046875, "step": 813 }, { "clip_ratio": 0.0, "completion_length": 639.609375, "epoch": 0.3256, "grad_norm": 2.3769339784906967, "kl": 0.73291015625, "learning_rate": 1.705624270431721e-05, "loss": 0.2715, "reward": 1.30078125, "reward_std": 0.6512450352311134, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.484375, "rewards/tag_count_reward": 0.66796875, "step": 814 }, { "clip_ratio": 0.0, "completion_length": 527.8984375, "epoch": 0.326, "grad_norm": 0.7904900687198914, "kl": 0.38623046875, "learning_rate": 1.7046342099635948e-05, "loss": 0.1332, "reward": 1.875, "reward_std": 0.2891687750816345, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.953125, "step": 815 }, { "clip_ratio": 0.0, "completion_length": 503.75, "epoch": 0.3264, "grad_norm": 3.042544714034103, "kl": 0.51025390625, "learning_rate": 1.7036427757750205e-05, "loss": 0.2642, "reward": 1.63671875, "reward_std": 0.5332868322730064, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.765625, "rewards/tag_count_reward": 0.85546875, "step": 816 }, { "clip_ratio": 0.0, "completion_length": 565.015625, "epoch": 0.3268, "grad_norm": 0.3709254885477703, "kl": 0.30322265625, "learning_rate": 1.7026499697988496e-05, "loss": 0.1943, "reward": 1.96875, "reward_std": 0.3136840686202049, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 817 }, { "clip_ratio": 0.0, "completion_length": 558.90625, "epoch": 0.3272, "grad_norm": 2.1207718913002314, "kl": 0.48876953125, "learning_rate": 1.7016557939706075e-05, "loss": 0.236, "reward": 1.80859375, "reward_std": 0.45464274287223816, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.89453125, "step": 818 }, { "clip_ratio": 0.0, "completion_length": 493.5, "epoch": 0.3276, "grad_norm": 1.3530269277411615, "kl": 0.5654296875, "learning_rate": 1.700660250228492e-05, "loss": 0.1921, "reward": 1.892578125, "reward_std": 0.36260994523763657, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.939453125, "step": 819 }, { "clip_ratio": 0.0, "completion_length": 495.515625, "epoch": 0.328, "grad_norm": 0.8077993934111994, "kl": 0.3037109375, "learning_rate": 1.6996633405133656e-05, "loss": 0.1075, "reward": 1.931640625, "reward_std": 0.24865080043673515, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.978515625, "step": 820 }, { "clip_ratio": 0.0, "completion_length": 536.390625, "epoch": 0.3284, "grad_norm": 2.247509978534387, "kl": 0.4296875, "learning_rate": 1.6986650667687552e-05, "loss": 0.159, "reward": 1.8671875, "reward_std": 0.3256009519100189, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.9453125, "step": 821 }, { "clip_ratio": 0.0, "completion_length": 577.0625, "epoch": 0.3288, "grad_norm": 0.7802681882010234, "kl": 0.30810546875, "learning_rate": 1.6976654309408464e-05, "loss": 0.0643, "reward": 2.056640625, "reward_std": 0.28774184733629227, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 822 }, { "clip_ratio": 0.0, "completion_length": 594.640625, "epoch": 0.3292, "grad_norm": 2.726628178127623, "kl": 0.3310546875, "learning_rate": 1.696664434978481e-05, "loss": 0.1573, "reward": 1.90234375, "reward_std": 0.3872654065489769, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.94921875, "step": 823 }, { "clip_ratio": 0.0, "completion_length": 637.4609375, "epoch": 0.3296, "grad_norm": 2.577188915058494, "kl": 0.35009765625, "learning_rate": 1.695662080833151e-05, "loss": 0.1243, "reward": 1.810546875, "reward_std": 0.3515469878911972, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.912109375, "step": 824 }, { "clip_ratio": 0.0, "completion_length": 591.3984375, "epoch": 0.33, "grad_norm": 0.650364691979574, "kl": 0.3369140625, "learning_rate": 1.6946583704589973e-05, "loss": 0.1487, "reward": 1.8125, "reward_std": 0.4977307692170143, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.9140625, "step": 825 }, { "clip_ratio": 0.0, "completion_length": 587.328125, "epoch": 0.3304, "grad_norm": 0.3590528046846324, "kl": 0.2734375, "learning_rate": 1.693653305812805e-05, "loss": 0.1183, "reward": 1.95703125, "reward_std": 0.3578726053237915, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.96484375, "step": 826 }, { "clip_ratio": 0.0, "completion_length": 601.9296875, "epoch": 0.3308, "grad_norm": 0.4217418848913955, "kl": 0.261962890625, "learning_rate": 1.6926468888539988e-05, "loss": 0.1434, "reward": 1.966796875, "reward_std": 0.45960937440395355, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.935546875, "step": 827 }, { "clip_ratio": 0.0, "completion_length": 589.2265625, "epoch": 0.3312, "grad_norm": 0.2518938753110878, "kl": 0.277099609375, "learning_rate": 1.6916391215446403e-05, "loss": 0.1733, "reward": 1.83203125, "reward_std": 0.3661922439932823, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.91796875, "step": 828 }, { "clip_ratio": 0.0, "completion_length": 493.328125, "epoch": 0.3316, "grad_norm": 0.828835256494598, "kl": 0.34423828125, "learning_rate": 1.690630005849423e-05, "loss": 0.1143, "reward": 1.958984375, "reward_std": 0.24218746274709702, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 829 }, { "clip_ratio": 0.0, "completion_length": 665.3359375, "epoch": 0.332, "grad_norm": 0.37220531539843726, "kl": 0.232177734375, "learning_rate": 1.68961954373567e-05, "loss": 0.0865, "reward": 1.83203125, "reward_std": 0.3841215670108795, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.93359375, "step": 830 }, { "clip_ratio": 0.0, "completion_length": 766.1484375, "epoch": 0.3324, "grad_norm": 0.5285965885057925, "kl": 0.226318359375, "learning_rate": 1.6886077371733285e-05, "loss": 0.1066, "reward": 1.49609375, "reward_std": 0.4776950031518936, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.6953125, "rewards/tag_count_reward": 0.78515625, "step": 831 }, { "clip_ratio": 0.0, "completion_length": 595.4921875, "epoch": 0.3328, "grad_norm": 0.2814987529701349, "kl": 0.225341796875, "learning_rate": 1.6875945881349676e-05, "loss": 0.0568, "reward": 2.14453125, "reward_std": 0.20046419650316238, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 832 }, { "clip_ratio": 0.0, "completion_length": 726.3828125, "epoch": 0.3332, "grad_norm": 0.3736612635544101, "kl": 0.261962890625, "learning_rate": 1.686580098595773e-05, "loss": 0.0458, "reward": 1.93359375, "reward_std": 0.24255750328302383, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 833 }, { "clip_ratio": 0.0, "completion_length": 670.46875, "epoch": 0.3336, "grad_norm": 0.23842435674424658, "kl": 0.2421875, "learning_rate": 1.6855642705335438e-05, "loss": 0.0317, "reward": 1.84765625, "reward_std": 0.20270764082670212, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.91796875, "step": 834 }, { "clip_ratio": 0.0, "completion_length": 694.3828125, "epoch": 0.334, "grad_norm": 0.31452335262759645, "kl": 0.2734375, "learning_rate": 1.684547105928689e-05, "loss": 0.0798, "reward": 2.1953125, "reward_std": 0.2880333885550499, "rewards/accuracy_reward": 0.2734375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 835 }, { "clip_ratio": 0.0, "completion_length": 638.6796875, "epoch": 0.3344, "grad_norm": 0.3244435098235616, "kl": 0.2421875, "learning_rate": 1.6835286067642228e-05, "loss": 0.0791, "reward": 2.015625, "reward_std": 0.3024388328194618, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.9765625, "step": 836 }, { "clip_ratio": 0.0, "completion_length": 686.671875, "epoch": 0.3348, "grad_norm": 0.3079878512589103, "kl": 0.2197265625, "learning_rate": 1.6825087750257617e-05, "loss": 0.0451, "reward": 1.953125, "reward_std": 0.31529550999403, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 837 }, { "clip_ratio": 0.0, "completion_length": 754.515625, "epoch": 0.3352, "grad_norm": 0.251529058851852, "kl": 0.206298828125, "learning_rate": 1.68148761270152e-05, "loss": 0.0583, "reward": 1.87890625, "reward_std": 0.30541761219501495, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.96484375, "step": 838 }, { "clip_ratio": 0.0, "completion_length": 744.890625, "epoch": 0.3356, "grad_norm": 0.27060030257852297, "kl": 0.220703125, "learning_rate": 1.6804651217823055e-05, "loss": 0.0868, "reward": 1.9453125, "reward_std": 0.42521536350250244, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.953125, "step": 839 }, { "clip_ratio": 0.0, "completion_length": 684.0, "epoch": 0.336, "grad_norm": 0.18783829425700116, "kl": 0.2158203125, "learning_rate": 1.6794413042615168e-05, "loss": 0.0275, "reward": 1.970703125, "reward_std": 0.12634295225143433, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 840 }, { "clip_ratio": 0.0, "completion_length": 610.1171875, "epoch": 0.3364, "grad_norm": 0.11663258310260098, "kl": 0.23388671875, "learning_rate": 1.6784161621351384e-05, "loss": 0.0037, "reward": 2.1796875, "reward_std": 0.06404343992471695, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 841 }, { "clip_ratio": 0.0, "completion_length": 578.359375, "epoch": 0.3368, "grad_norm": 0.2420985512874179, "kl": 0.224365234375, "learning_rate": 1.6773896974017373e-05, "loss": 0.0373, "reward": 2.017578125, "reward_std": 0.14007875323295593, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 842 }, { "clip_ratio": 0.0, "completion_length": 610.9453125, "epoch": 0.3372, "grad_norm": 0.3083968708006145, "kl": 0.224609375, "learning_rate": 1.6763619120624595e-05, "loss": 0.0418, "reward": 2.244140625, "reward_std": 0.20903604477643967, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 843 }, { "clip_ratio": 0.0, "completion_length": 654.921875, "epoch": 0.3376, "grad_norm": 0.22782097279828914, "kl": 0.231201171875, "learning_rate": 1.6753328081210244e-05, "loss": 0.062, "reward": 2.03125, "reward_std": 0.26830074191093445, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 844 }, { "clip_ratio": 0.0, "completion_length": 647.3125, "epoch": 0.338, "grad_norm": 0.2311318523987871, "kl": 0.23974609375, "learning_rate": 1.6743023875837233e-05, "loss": 0.0711, "reward": 2.0625, "reward_std": 0.39606328308582306, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.96875, "step": 845 }, { "clip_ratio": 0.0, "completion_length": 618.4296875, "epoch": 0.3384, "grad_norm": 0.24576698783688836, "kl": 0.2392578125, "learning_rate": 1.6732706524594138e-05, "loss": 0.0152, "reward": 2.0859375, "reward_std": 0.10673906654119492, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 846 }, { "clip_ratio": 0.0, "completion_length": 637.03125, "epoch": 0.3388, "grad_norm": 0.1824875447075993, "kl": 0.244140625, "learning_rate": 1.6722376047595163e-05, "loss": 0.0694, "reward": 2.140625, "reward_std": 0.2810143083333969, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 847 }, { "clip_ratio": 0.0, "completion_length": 571.734375, "epoch": 0.3392, "grad_norm": 0.3329069997624055, "kl": 0.254150390625, "learning_rate": 1.6712032464980094e-05, "loss": 0.0139, "reward": 2.017578125, "reward_std": 0.11058919876813889, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 848 }, { "clip_ratio": 0.0, "completion_length": 648.203125, "epoch": 0.3396, "grad_norm": 0.23783622893592382, "kl": 0.21240234375, "learning_rate": 1.6701675796914284e-05, "loss": 0.0188, "reward": 2.064453125, "reward_std": 0.2101312279701233, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 849 }, { "clip_ratio": 0.0, "completion_length": 722.28125, "epoch": 0.34, "grad_norm": 0.8745745778889253, "kl": 0.26220703125, "learning_rate": 1.6691306063588583e-05, "loss": 0.0888, "reward": 1.833984375, "reward_std": 0.29844629019498825, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.935546875, "step": 850 }, { "clip_ratio": 0.0, "completion_length": 553.578125, "epoch": 0.3404, "grad_norm": 0.1325611028234062, "kl": 0.22021484375, "learning_rate": 1.668092328521932e-05, "loss": 0.0097, "reward": 2.15234375, "reward_std": 0.08218611031770706, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.99609375, "step": 851 }, { "clip_ratio": 0.0, "completion_length": 673.984375, "epoch": 0.3408, "grad_norm": 0.19864399389851176, "kl": 0.209228515625, "learning_rate": 1.6670527482048246e-05, "loss": 0.0334, "reward": 2.00390625, "reward_std": 0.16527669876813889, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 852 }, { "clip_ratio": 0.0, "completion_length": 681.5, "epoch": 0.3412, "grad_norm": 0.5715221371877427, "kl": 0.296875, "learning_rate": 1.666011867434252e-05, "loss": 0.037, "reward": 1.998046875, "reward_std": 0.1519516110420227, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 853 }, { "clip_ratio": 0.0, "completion_length": 715.7578125, "epoch": 0.3416, "grad_norm": 0.16224438830622226, "kl": 0.194580078125, "learning_rate": 1.6649696882394635e-05, "loss": 0.0045, "reward": 2.005859375, "reward_std": 0.08175812661647797, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 854 }, { "clip_ratio": 0.0, "completion_length": 776.9375, "epoch": 0.342, "grad_norm": 0.19156607557235328, "kl": 0.200439453125, "learning_rate": 1.6639262126522417e-05, "loss": 0.0366, "reward": 1.966796875, "reward_std": 0.1328125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 855 }, { "clip_ratio": 0.0, "completion_length": 732.5859375, "epoch": 0.3424, "grad_norm": 0.3059096816457321, "kl": 0.20751953125, "learning_rate": 1.6628814427068954e-05, "loss": 0.0558, "reward": 2.08203125, "reward_std": 0.3226889371871948, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 856 }, { "clip_ratio": 0.0, "completion_length": 760.84375, "epoch": 0.3428, "grad_norm": 0.16525782005507342, "kl": 0.212158203125, "learning_rate": 1.6618353804402567e-05, "loss": 0.0353, "reward": 2.015625, "reward_std": 0.15358919650316238, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 857 }, { "clip_ratio": 0.0, "completion_length": 730.9140625, "epoch": 0.3432, "grad_norm": 0.28731085385691846, "kl": 0.22216796875, "learning_rate": 1.6607880278916778e-05, "loss": 0.0406, "reward": 2.078125, "reward_std": 0.27775172144174576, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 858 }, { "clip_ratio": 0.0, "completion_length": 784.515625, "epoch": 0.3436, "grad_norm": 0.3003148535889357, "kl": 0.208251953125, "learning_rate": 1.6597393871030264e-05, "loss": 0.0474, "reward": 1.9140625, "reward_std": 0.29117918759584427, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.9765625, "step": 859 }, { "clip_ratio": 0.0, "completion_length": 731.8671875, "epoch": 0.344, "grad_norm": 1.0665846644125105, "kl": 0.2392578125, "learning_rate": 1.6586894601186804e-05, "loss": 0.0261, "reward": 2.171875, "reward_std": 0.12909550219774246, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 860 }, { "clip_ratio": 0.0, "completion_length": 742.1015625, "epoch": 0.3444, "grad_norm": 0.3305642112828381, "kl": 0.206298828125, "learning_rate": 1.6576382489855274e-05, "loss": 0.0368, "reward": 2.072265625, "reward_std": 0.382358118891716, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.978515625, "step": 861 }, { "clip_ratio": 0.0, "completion_length": 745.1953125, "epoch": 0.3448, "grad_norm": 0.1615406889970593, "kl": 0.2041015625, "learning_rate": 1.6565857557529567e-05, "loss": 0.0415, "reward": 1.958984375, "reward_std": 0.1640625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 862 }, { "clip_ratio": 0.0, "completion_length": 696.703125, "epoch": 0.3452, "grad_norm": 1.1043473767599, "kl": 0.3037109375, "learning_rate": 1.6555319824728577e-05, "loss": 0.0768, "reward": 1.92578125, "reward_std": 0.3555070236325264, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.94921875, "step": 863 }, { "clip_ratio": 0.0, "completion_length": 657.25, "epoch": 0.3456, "grad_norm": 0.2346761908777861, "kl": 0.2158203125, "learning_rate": 1.654476931199615e-05, "loss": 0.0467, "reward": 2.119140625, "reward_std": 0.0859375, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 864 }, { "clip_ratio": 0.0, "completion_length": 685.40625, "epoch": 0.346, "grad_norm": 0.29202383194836834, "kl": 0.22607421875, "learning_rate": 1.6534206039901057e-05, "loss": 0.0268, "reward": 2.033203125, "reward_std": 0.21248093992471695, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.986328125, "step": 865 }, { "clip_ratio": 0.0, "completion_length": 685.1640625, "epoch": 0.3464, "grad_norm": 0.18726255101940945, "kl": 0.224365234375, "learning_rate": 1.652363002903693e-05, "loss": 0.0383, "reward": 2.11328125, "reward_std": 0.13780806958675385, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98828125, "step": 866 }, { "clip_ratio": 0.0, "completion_length": 652.8984375, "epoch": 0.3468, "grad_norm": 0.27782187990133794, "kl": 0.206298828125, "learning_rate": 1.6513041300022253e-05, "loss": 0.0038, "reward": 2.0546875, "reward_std": 0.14965169876813889, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 867 }, { "clip_ratio": 0.0, "completion_length": 617.53125, "epoch": 0.3472, "grad_norm": 0.2876270954445877, "kl": 0.226318359375, "learning_rate": 1.650243987350029e-05, "loss": 0.0594, "reward": 2.01953125, "reward_std": 0.171875, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 868 }, { "clip_ratio": 0.0, "completion_length": 663.1171875, "epoch": 0.3476, "grad_norm": 0.26431339962923134, "kl": 0.214599609375, "learning_rate": 1.649182577013906e-05, "loss": -0.001, "reward": 2.005859375, "reward_std": 0.1015625, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 869 }, { "clip_ratio": 0.0, "completion_length": 669.734375, "epoch": 0.348, "grad_norm": 0.17504844436542175, "kl": 0.222412109375, "learning_rate": 1.6481199010631312e-05, "loss": 0.0155, "reward": 2.056640625, "reward_std": 0.13663385808467865, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 870 }, { "clip_ratio": 0.0, "completion_length": 694.3125, "epoch": 0.3484, "grad_norm": 0.14373938494221, "kl": 0.2197265625, "learning_rate": 1.6470559615694445e-05, "loss": 0.0212, "reward": 2.017578125, "reward_std": 0.11515908688306808, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 871 }, { "clip_ratio": 0.0, "completion_length": 637.1796875, "epoch": 0.3488, "grad_norm": 0.17683640863325537, "kl": 0.225830078125, "learning_rate": 1.6459907606070513e-05, "loss": 0.0222, "reward": 2.076171875, "reward_std": 0.11528604477643967, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 872 }, { "clip_ratio": 0.0, "completion_length": 742.5078125, "epoch": 0.3492, "grad_norm": 0.2794284984089648, "kl": 0.23291015625, "learning_rate": 1.6449243002526146e-05, "loss": 0.0235, "reward": 2.01171875, "reward_std": 0.1387607902288437, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.99609375, "step": 873 }, { "clip_ratio": 0.0, "completion_length": 659.9375, "epoch": 0.3496, "grad_norm": 0.2573620673098058, "kl": 0.222412109375, "learning_rate": 1.643856582585254e-05, "loss": 0.0148, "reward": 1.994140625, "reward_std": 0.12863312661647797, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.994140625, "step": 874 }, { "clip_ratio": 0.0, "completion_length": 732.765625, "epoch": 0.35, "grad_norm": 0.23254631405177748, "kl": 0.2099609375, "learning_rate": 1.6427876096865394e-05, "loss": 0.0321, "reward": 1.984375, "reward_std": 0.125, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 875 }, { "clip_ratio": 0.0, "completion_length": 756.5, "epoch": 0.3504, "grad_norm": 0.20970682969754306, "kl": 0.20166015625, "learning_rate": 1.6417173836404888e-05, "loss": 0.0311, "reward": 1.97265625, "reward_std": 0.16080472618341446, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98828125, "step": 876 }, { "clip_ratio": 0.0, "completion_length": 778.46875, "epoch": 0.3508, "grad_norm": 0.24141290136805707, "kl": 0.22900390625, "learning_rate": 1.6406459065335616e-05, "loss": 0.0354, "reward": 1.951171875, "reward_std": 0.23577880859375, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.974609375, "step": 877 }, { "clip_ratio": 0.0, "completion_length": 655.0625, "epoch": 0.3512, "grad_norm": 0.22921263741270062, "kl": 0.202392578125, "learning_rate": 1.6395731804546582e-05, "loss": 0.0316, "reward": 2.02734375, "reward_std": 0.17341843992471695, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 878 }, { "clip_ratio": 0.0, "completion_length": 712.734375, "epoch": 0.3516, "grad_norm": 0.22881782949036716, "kl": 0.212158203125, "learning_rate": 1.6384992074951124e-05, "loss": 0.0447, "reward": 1.962890625, "reward_std": 0.1484375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 879 }, { "clip_ratio": 0.0, "completion_length": 695.59375, "epoch": 0.352, "grad_norm": 0.20417136860260973, "kl": 0.20361328125, "learning_rate": 1.63742398974869e-05, "loss": 0.0318, "reward": 2.095703125, "reward_std": 0.09243203699588776, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.994140625, "step": 880 }, { "clip_ratio": 0.0, "completion_length": 723.515625, "epoch": 0.3524, "grad_norm": 0.48386004553587925, "kl": 0.2958984375, "learning_rate": 1.6363475293115824e-05, "loss": 0.0697, "reward": 2.041015625, "reward_std": 0.37697891891002655, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.931640625, "step": 881 }, { "clip_ratio": 0.0, "completion_length": 743.1015625, "epoch": 0.3528, "grad_norm": 0.2325664124381673, "kl": 0.22314453125, "learning_rate": 1.6352698282824045e-05, "loss": 0.0321, "reward": 1.96484375, "reward_std": 0.2210279107093811, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 882 }, { "clip_ratio": 0.0, "completion_length": 728.3671875, "epoch": 0.3532, "grad_norm": 0.2659667914607042, "kl": 0.180419921875, "learning_rate": 1.6341908887621894e-05, "loss": 0.0243, "reward": 2.14453125, "reward_std": 0.17120973765850067, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.99609375, "step": 883 }, { "clip_ratio": 0.0, "completion_length": 765.34375, "epoch": 0.3536, "grad_norm": 0.325678665980606, "kl": 0.216796875, "learning_rate": 1.6331107128543856e-05, "loss": 0.0529, "reward": 1.94921875, "reward_std": 0.40390094369649887, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 884 }, { "clip_ratio": 0.0, "completion_length": 788.1015625, "epoch": 0.354, "grad_norm": 0.23894600077181602, "kl": 0.174560546875, "learning_rate": 1.632029302664851e-05, "loss": 0.07, "reward": 2.009765625, "reward_std": 0.36466552317142487, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.955078125, "step": 885 }, { "clip_ratio": 0.0, "completion_length": 727.4375, "epoch": 0.3544, "grad_norm": 0.32316617152752675, "kl": 0.185546875, "learning_rate": 1.6309466603018497e-05, "loss": 0.0154, "reward": 2.06640625, "reward_std": 0.35153625905513763, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.98828125, "step": 886 }, { "clip_ratio": 0.0, "completion_length": 638.0703125, "epoch": 0.3548, "grad_norm": 0.3159280422709259, "kl": 0.19970703125, "learning_rate": 1.6298627878760488e-05, "loss": 0.0434, "reward": 2.07421875, "reward_std": 0.33142898976802826, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 887 }, { "clip_ratio": 0.0, "completion_length": 805.4296875, "epoch": 0.3552, "grad_norm": 0.2365272230545033, "kl": 0.2138671875, "learning_rate": 1.628777687500513e-05, "loss": 0.0344, "reward": 1.9375, "reward_std": 0.2004890739917755, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.984375, "step": 888 }, { "clip_ratio": 0.0, "completion_length": 692.328125, "epoch": 0.3556, "grad_norm": 0.3121578740962895, "kl": 0.19580078125, "learning_rate": 1.6276913612907005e-05, "loss": 0.0747, "reward": 2.07421875, "reward_std": 0.36400531977415085, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 889 }, { "clip_ratio": 0.0, "completion_length": 665.8515625, "epoch": 0.356, "grad_norm": 0.27377784059265325, "kl": 0.211181640625, "learning_rate": 1.6266038113644605e-05, "loss": 0.0528, "reward": 1.966796875, "reward_std": 0.2033504769206047, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.982421875, "step": 890 }, { "clip_ratio": 0.0, "completion_length": 673.8984375, "epoch": 0.3564, "grad_norm": 0.23313048882055187, "kl": 0.205078125, "learning_rate": 1.6255150398420273e-05, "loss": 0.0477, "reward": 2.056640625, "reward_std": 0.30715592950582504, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 891 }, { "clip_ratio": 0.0, "completion_length": 663.328125, "epoch": 0.3568, "grad_norm": 0.28227219561439093, "kl": 0.198974609375, "learning_rate": 1.624425048846016e-05, "loss": 0.0476, "reward": 2.0078125, "reward_std": 0.23225802183151245, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.9453125, "step": 892 }, { "clip_ratio": 0.0, "completion_length": 703.8359375, "epoch": 0.3572, "grad_norm": 0.29138529708873107, "kl": 0.199951171875, "learning_rate": 1.6233338405014204e-05, "loss": 0.0433, "reward": 2.037109375, "reward_std": 0.25935593992471695, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 893 }, { "clip_ratio": 0.0, "completion_length": 740.1015625, "epoch": 0.3576, "grad_norm": 0.21687269282469493, "kl": 0.212890625, "learning_rate": 1.6222414169356066e-05, "loss": 0.036, "reward": 1.900390625, "reward_std": 0.19459576904773712, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.970703125, "step": 894 }, { "clip_ratio": 0.0, "completion_length": 655.328125, "epoch": 0.358, "grad_norm": 0.25055932366910066, "kl": 0.214599609375, "learning_rate": 1.6211477802783105e-05, "loss": 0.0222, "reward": 1.994140625, "reward_std": 0.0859375, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 895 }, { "clip_ratio": 0.0, "completion_length": 595.4453125, "epoch": 0.3584, "grad_norm": 0.3633257574433172, "kl": 0.18994140625, "learning_rate": 1.620052932661633e-05, "loss": 0.0653, "reward": 2.216796875, "reward_std": 0.31312668323516846, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 896 }, { "clip_ratio": 0.0, "completion_length": 650.0546875, "epoch": 0.3588, "grad_norm": 0.3170627804439708, "kl": 0.189697265625, "learning_rate": 1.618956876220035e-05, "loss": 0.0388, "reward": 2.12109375, "reward_std": 0.31680191308259964, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 897 }, { "clip_ratio": 0.0, "completion_length": 601.296875, "epoch": 0.3592, "grad_norm": 0.29790949409965434, "kl": 0.1953125, "learning_rate": 1.6178596130903345e-05, "loss": 0.0729, "reward": 1.953125, "reward_std": 0.1875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 898 }, { "clip_ratio": 0.0, "completion_length": 635.875, "epoch": 0.3596, "grad_norm": 0.42739860329338997, "kl": 0.2021484375, "learning_rate": 1.6167611454117027e-05, "loss": 0.0616, "reward": 1.966796875, "reward_std": 0.1953125, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 899 }, { "clip_ratio": 0.0, "completion_length": 670.5859375, "epoch": 0.36, "grad_norm": 0.36184381107864655, "kl": 0.19775390625, "learning_rate": 1.6156614753256583e-05, "loss": 0.0352, "reward": 1.98828125, "reward_std": 0.2414828985929489, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 900 }, { "clip_ratio": 0.0, "completion_length": 654.0234375, "epoch": 0.3604, "grad_norm": 1.2128584082902818, "kl": 0.2626953125, "learning_rate": 1.6145606049760644e-05, "loss": 0.0673, "reward": 1.880859375, "reward_std": 0.23196203261613846, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.951171875, "step": 901 }, { "clip_ratio": 0.0, "completion_length": 581.484375, "epoch": 0.3608, "grad_norm": 0.4476100251215969, "kl": 0.18408203125, "learning_rate": 1.6134585365091243e-05, "loss": 0.0335, "reward": 2.09765625, "reward_std": 0.24505160003900528, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 902 }, { "clip_ratio": 0.0, "completion_length": 593.5703125, "epoch": 0.3612, "grad_norm": 6.433205722273589, "kl": 0.59521484375, "learning_rate": 1.6123552720733767e-05, "loss": 0.1283, "reward": 1.982421875, "reward_std": 0.3316171169281006, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 903 }, { "clip_ratio": 0.0, "completion_length": 618.265625, "epoch": 0.3616, "grad_norm": 28.927397889254635, "kl": 2.909423828125, "learning_rate": 1.611250813819692e-05, "loss": 0.2178, "reward": 2.083984375, "reward_std": 0.1640625, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 904 }, { "clip_ratio": 0.0, "completion_length": 742.8125, "epoch": 0.362, "grad_norm": 7.684622050593125, "kl": 0.888916015625, "learning_rate": 1.610145163901268e-05, "loss": 0.1307, "reward": 1.787109375, "reward_std": 0.3771911785006523, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.912109375, "step": 905 }, { "clip_ratio": 0.0, "completion_length": 674.96875, "epoch": 0.3624, "grad_norm": 0.8838106681148085, "kl": 0.255859375, "learning_rate": 1.6090383244736256e-05, "loss": 0.0593, "reward": 1.97265625, "reward_std": 0.109375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 906 }, { "clip_ratio": 0.0, "completion_length": 709.546875, "epoch": 0.3628, "grad_norm": 0.215850560484115, "kl": 0.238525390625, "learning_rate": 1.6079302976946055e-05, "loss": 0.0197, "reward": 2.119140625, "reward_std": 0.16250257194042206, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 907 }, { "clip_ratio": 0.0, "completion_length": 680.375, "epoch": 0.3632, "grad_norm": 0.9857928335482166, "kl": 0.35009765625, "learning_rate": 1.6068210857243625e-05, "loss": 0.0543, "reward": 2.068359375, "reward_std": 0.21318094432353973, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 908 }, { "clip_ratio": 0.0, "completion_length": 699.671875, "epoch": 0.3636, "grad_norm": 0.9256505978090341, "kl": 0.21142578125, "learning_rate": 1.6057106907253617e-05, "loss": 0.0318, "reward": 2.01171875, "reward_std": 0.16921419650316238, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 909 }, { "clip_ratio": 0.0, "completion_length": 708.046875, "epoch": 0.364, "grad_norm": 0.22284569674515745, "kl": 0.266845703125, "learning_rate": 1.6045991148623752e-05, "loss": 0.0242, "reward": 2.025390625, "reward_std": 0.14183919876813889, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 910 }, { "clip_ratio": 0.0, "completion_length": 716.546875, "epoch": 0.3644, "grad_norm": 0.3576454231495911, "kl": 0.22216796875, "learning_rate": 1.6034863603024768e-05, "loss": 0.0209, "reward": 1.990234375, "reward_std": 0.0390625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 911 }, { "clip_ratio": 0.0, "completion_length": 695.984375, "epoch": 0.3648, "grad_norm": 0.4657901534243265, "kl": 0.3037109375, "learning_rate": 1.6023724292150387e-05, "loss": 0.073, "reward": 1.990234375, "reward_std": 0.25440484285354614, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.974609375, "step": 912 }, { "clip_ratio": 0.0, "completion_length": 649.875, "epoch": 0.3652, "grad_norm": 0.160951689112283, "kl": 0.1943359375, "learning_rate": 1.601257323771727e-05, "loss": 0.0125, "reward": 2.02734375, "reward_std": 0.1544090062379837, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 913 }, { "clip_ratio": 0.0, "completion_length": 724.3125, "epoch": 0.3656, "grad_norm": 0.2001411076958709, "kl": 0.216064453125, "learning_rate": 1.6001410461464955e-05, "loss": 0.0213, "reward": 2.166015625, "reward_std": 0.14336346089839935, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 914 }, { "clip_ratio": 0.0, "completion_length": 791.5234375, "epoch": 0.366, "grad_norm": 49.441937761365025, "kl": 1.81591796875, "learning_rate": 1.599023598515586e-05, "loss": 0.1383, "reward": 2.009765625, "reward_std": 0.40806030482053757, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.970703125, "step": 915 }, { "clip_ratio": 0.0, "completion_length": 707.484375, "epoch": 0.3664, "grad_norm": 0.19623451743330658, "kl": 0.20458984375, "learning_rate": 1.597904983057519e-05, "loss": 0.0162, "reward": 2.166015625, "reward_std": 0.11873093992471695, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 916 }, { "clip_ratio": 0.0, "completion_length": 714.609375, "epoch": 0.3668, "grad_norm": 0.254321763821097, "kl": 0.19970703125, "learning_rate": 1.596785201953093e-05, "loss": 0.0183, "reward": 2.171875, "reward_std": 0.11840169876813889, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 917 }, { "clip_ratio": 0.0, "completion_length": 716.7265625, "epoch": 0.3672, "grad_norm": 0.2839919439191706, "kl": 0.21533203125, "learning_rate": 1.5956642573853784e-05, "loss": 0.0523, "reward": 2.13671875, "reward_std": 0.3119678720831871, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 918 }, { "clip_ratio": 0.0, "completion_length": 691.3828125, "epoch": 0.3676, "grad_norm": 0.9164185129439302, "kl": 0.265869140625, "learning_rate": 1.5945421515397135e-05, "loss": 0.0495, "reward": 2.14453125, "reward_std": 0.2786979004740715, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 919 }, { "clip_ratio": 0.0, "completion_length": 574.6484375, "epoch": 0.368, "grad_norm": 0.1780493311663793, "kl": 0.20361328125, "learning_rate": 1.5934188866037017e-05, "loss": 0.0083, "reward": 2.140625, "reward_std": 0.042695626616477966, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 920 }, { "clip_ratio": 0.0, "completion_length": 635.9765625, "epoch": 0.3684, "grad_norm": 0.29039495511858354, "kl": 0.19921875, "learning_rate": 1.592294464767205e-05, "loss": 0.0331, "reward": 2.197265625, "reward_std": 0.1484375, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 921 }, { "clip_ratio": 0.0, "completion_length": 684.53125, "epoch": 0.3688, "grad_norm": 0.658975128714615, "kl": 0.224609375, "learning_rate": 1.591168888222342e-05, "loss": 0.0074, "reward": 2.1328125, "reward_std": 0.03125, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 922 }, { "clip_ratio": 0.0, "completion_length": 674.171875, "epoch": 0.3692, "grad_norm": 0.4106746804670099, "kl": 0.244873046875, "learning_rate": 1.5900421591634813e-05, "loss": 0.0715, "reward": 2.189453125, "reward_std": 0.24960917234420776, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 923 }, { "clip_ratio": 0.0, "completion_length": 685.3203125, "epoch": 0.3696, "grad_norm": 0.4181429791711812, "kl": 0.237060546875, "learning_rate": 1.5889142797872387e-05, "loss": 0.0367, "reward": 2.216796875, "reward_std": 0.16065485030412674, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 924 }, { "clip_ratio": 0.0, "completion_length": 756.3984375, "epoch": 0.37, "grad_norm": 0.24535227948848418, "kl": 0.20361328125, "learning_rate": 1.5877852522924733e-05, "loss": 0.0423, "reward": 1.978515625, "reward_std": 0.19056735932826996, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.986328125, "step": 925 }, { "clip_ratio": 0.0, "completion_length": 669.7265625, "epoch": 0.3704, "grad_norm": 67.53583448549404, "kl": 0.191162109375, "learning_rate": 1.5866550788802815e-05, "loss": 0.0346, "reward": 2.169921875, "reward_std": 0.1953125, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 926 }, { "clip_ratio": 0.0, "completion_length": 661.1015625, "epoch": 0.3708, "grad_norm": 67.31917478516255, "kl": 0.22509765625, "learning_rate": 1.5855237617539943e-05, "loss": 0.0666, "reward": 2.1171875, "reward_std": 0.13644562661647797, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 927 }, { "clip_ratio": 0.0, "completion_length": 672.96875, "epoch": 0.3712, "grad_norm": 43.87140434839348, "kl": 0.2470703125, "learning_rate": 1.5843913031191722e-05, "loss": 0.0405, "reward": 1.97265625, "reward_std": 0.109375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 928 }, { "clip_ratio": 0.0, "completion_length": 621.59375, "epoch": 0.3716, "grad_norm": 1916.9048630959433, "kl": 4.84765625, "learning_rate": 1.5832577051836016e-05, "loss": 0.3908, "reward": 2.005859375, "reward_std": 0.32853076606988907, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 929 }, { "clip_ratio": 0.0, "completion_length": 682.5703125, "epoch": 0.372, "grad_norm": 3546.3627653073236, "kl": 0.49365234375, "learning_rate": 1.5821229701572897e-05, "loss": 0.0371, "reward": 1.99609375, "reward_std": 0.1486629769206047, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 930 }, { "clip_ratio": 0.0, "completion_length": 657.828125, "epoch": 0.3724, "grad_norm": 1970.8958214072602, "kl": 2.4921875, "learning_rate": 1.5809871002524602e-05, "loss": 0.2037, "reward": 2.0234375, "reward_std": 0.2950340062379837, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.9609375, "step": 931 }, { "clip_ratio": 0.0, "completion_length": 625.28125, "epoch": 0.3728, "grad_norm": 0.1893414479650944, "kl": 0.2373046875, "learning_rate": 1.5798500976835493e-05, "loss": 0.0162, "reward": 2.140625, "reward_std": 0.0625, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 932 }, { "clip_ratio": 0.0, "completion_length": 670.7265625, "epoch": 0.3732, "grad_norm": 1668.6480927107261, "kl": 2.309814453125, "learning_rate": 1.5787119646672025e-05, "loss": 0.1102, "reward": 2.0078125, "reward_std": 0.12537535279989243, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 933 }, { "clip_ratio": 0.0, "completion_length": 694.203125, "epoch": 0.3736, "grad_norm": 836.6080686527883, "kl": 15.509521484375, "learning_rate": 1.5775727034222675e-05, "loss": 0.9199, "reward": 1.916015625, "reward_std": 0.23966552317142487, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.970703125, "step": 934 }, { "clip_ratio": 0.0, "completion_length": 704.28125, "epoch": 0.374, "grad_norm": 5131.27576655744, "kl": 247.783203125, "learning_rate": 1.5764323161697933e-05, "loss": 11.9359, "reward": 1.8203125, "reward_std": 0.317649282515049, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.9296875, "step": 935 }, { "clip_ratio": 0.0, "completion_length": 666.7890625, "epoch": 0.3744, "grad_norm": 28.572437314058806, "kl": 0.744140625, "learning_rate": 1.575290805133023e-05, "loss": 0.1554, "reward": 1.890625, "reward_std": 0.48727255314588547, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.9375, "step": 936 }, { "clip_ratio": 0.0, "completion_length": 688.0859375, "epoch": 0.3748, "grad_norm": 650.9284000204301, "kl": 0.88232421875, "learning_rate": 1.57414817253739e-05, "loss": 0.1776, "reward": 1.931640625, "reward_std": 0.5661936923861504, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.892578125, "step": 937 }, { "clip_ratio": 0.0, "completion_length": 680.7109375, "epoch": 0.3752, "grad_norm": 14.674113397500681, "kl": 0.9111328125, "learning_rate": 1.5730044206105156e-05, "loss": 0.1861, "reward": 1.56640625, "reward_std": 0.5089903175830841, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.734375, "rewards/tag_count_reward": 0.83203125, "step": 938 }, { "clip_ratio": 0.0, "completion_length": 720.5625, "epoch": 0.3756, "grad_norm": 32.47785419683219, "kl": 1.64013671875, "learning_rate": 1.5718595515822027e-05, "loss": 0.1642, "reward": 1.533203125, "reward_std": 0.42311910539865494, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.71875, "rewards/tag_count_reward": 0.798828125, "step": 939 }, { "clip_ratio": 0.0, "completion_length": 700.203125, "epoch": 0.376, "grad_norm": 16.405512108948237, "kl": 0.60791015625, "learning_rate": 1.570713567684432e-05, "loss": 0.1124, "reward": 1.728515625, "reward_std": 0.3494833707809448, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.8203125, "rewards/tag_count_reward": 0.876953125, "step": 940 }, { "clip_ratio": 0.0, "completion_length": 674.3515625, "epoch": 0.3764, "grad_norm": 13.333921215156527, "kl": 0.57470703125, "learning_rate": 1.5695664711513575e-05, "loss": 0.1083, "reward": 1.71875, "reward_std": 0.38814688846468925, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.7890625, "rewards/tag_count_reward": 0.8515625, "step": 941 }, { "clip_ratio": 0.0, "completion_length": 646.265625, "epoch": 0.3768, "grad_norm": 17.948543018798127, "kl": 0.64501953125, "learning_rate": 1.568418264219303e-05, "loss": 0.1397, "reward": 1.8125, "reward_std": 0.38614753633737564, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.8984375, "step": 942 }, { "clip_ratio": 0.0, "completion_length": 704.5625, "epoch": 0.3772, "grad_norm": 23.790423503192265, "kl": 0.41455078125, "learning_rate": 1.567268949126757e-05, "loss": 0.1268, "reward": 1.91015625, "reward_std": 0.43036141991615295, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.91796875, "step": 943 }, { "clip_ratio": 0.0, "completion_length": 667.0859375, "epoch": 0.3776, "grad_norm": 20.25301999423248, "kl": 0.40380859375, "learning_rate": 1.5661185281143666e-05, "loss": 0.1087, "reward": 1.96484375, "reward_std": 0.4196759909391403, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.95703125, "step": 944 }, { "clip_ratio": 0.0, "completion_length": 676.84375, "epoch": 0.378, "grad_norm": 32.09149920993311, "kl": 0.455078125, "learning_rate": 1.564967003424938e-05, "loss": 0.1375, "reward": 1.83203125, "reward_std": 0.4237711876630783, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.94140625, "step": 945 }, { "clip_ratio": 0.0, "completion_length": 633.3203125, "epoch": 0.3784, "grad_norm": 9.751999712231813, "kl": 0.34326171875, "learning_rate": 1.5638143773034268e-05, "loss": 0.069, "reward": 2.201171875, "reward_std": 0.23569566011428833, "rewards/accuracy_reward": 0.2421875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 946 }, { "clip_ratio": 0.0, "completion_length": 670.046875, "epoch": 0.3788, "grad_norm": 40.33144451175942, "kl": 0.48974609375, "learning_rate": 1.562660651996937e-05, "loss": 0.1473, "reward": 1.984375, "reward_std": 0.393785685300827, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.9453125, "step": 947 }, { "clip_ratio": 0.0, "completion_length": 662.8203125, "epoch": 0.3792, "grad_norm": 5.317326727458975, "kl": 0.27587890625, "learning_rate": 1.5615058297547144e-05, "loss": 0.0345, "reward": 2.1328125, "reward_std": 0.286354124546051, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.9765625, "step": 948 }, { "clip_ratio": 0.0, "completion_length": 672.9609375, "epoch": 0.3796, "grad_norm": 8.48011599662747, "kl": 0.42333984375, "learning_rate": 1.5603499128281447e-05, "loss": 0.117, "reward": 1.8828125, "reward_std": 0.39168349653482437, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9375, "step": 949 }, { "clip_ratio": 0.0, "completion_length": 553.3203125, "epoch": 0.38, "grad_norm": 15.864000737891079, "kl": 0.33935546875, "learning_rate": 1.5591929034707468e-05, "loss": 0.0935, "reward": 1.96484375, "reward_std": 0.26728852838277817, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 950 }, { "clip_ratio": 0.0, "completion_length": 648.0, "epoch": 0.3804, "grad_norm": 5.016453929273674, "kl": 0.34716796875, "learning_rate": 1.55803480393817e-05, "loss": 0.0841, "reward": 1.919921875, "reward_std": 0.34988612681627274, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.951171875, "step": 951 }, { "clip_ratio": 0.0, "completion_length": 610.9375, "epoch": 0.3808, "grad_norm": 11.626817422431191, "kl": 0.394287109375, "learning_rate": 1.556875616488188e-05, "loss": 0.1151, "reward": 1.921875, "reward_std": 0.4301172196865082, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.9453125, "step": 952 }, { "clip_ratio": 0.0, "completion_length": 629.921875, "epoch": 0.3812, "grad_norm": 1.7457622978121183, "kl": 0.4619140625, "learning_rate": 1.5557153433806967e-05, "loss": 0.1355, "reward": 1.95703125, "reward_std": 0.456713542342186, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94921875, "step": 953 }, { "clip_ratio": 0.0, "completion_length": 682.2109375, "epoch": 0.3816, "grad_norm": 0.4761433882386828, "kl": 0.287109375, "learning_rate": 1.5545539868777075e-05, "loss": 0.1044, "reward": 1.958984375, "reward_std": 0.3893163427710533, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 954 }, { "clip_ratio": 0.0, "completion_length": 592.8671875, "epoch": 0.382, "grad_norm": 40.559225682052734, "kl": 0.95556640625, "learning_rate": 1.553391549243344e-05, "loss": 0.1017, "reward": 1.974609375, "reward_std": 0.21700367331504822, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.982421875, "step": 955 }, { "clip_ratio": 0.0, "completion_length": 650.4921875, "epoch": 0.3824, "grad_norm": 1.1420366262042236, "kl": 0.345703125, "learning_rate": 1.5522280327438388e-05, "loss": 0.1189, "reward": 1.908203125, "reward_std": 0.35407766699790955, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.955078125, "step": 956 }, { "clip_ratio": 0.0, "completion_length": 643.6796875, "epoch": 0.3828, "grad_norm": 1.2059289637160362, "kl": 0.38134765625, "learning_rate": 1.5510634396475262e-05, "loss": 0.1622, "reward": 1.9296875, "reward_std": 0.5686949491500854, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.9296875, "step": 957 }, { "clip_ratio": 0.0, "completion_length": 746.2421875, "epoch": 0.3832, "grad_norm": 0.7024114243334588, "kl": 0.27392578125, "learning_rate": 1.54989777222484e-05, "loss": 0.1117, "reward": 1.908203125, "reward_std": 0.5452473387122154, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.939453125, "step": 958 }, { "clip_ratio": 0.0, "completion_length": 704.5, "epoch": 0.3836, "grad_norm": 0.6143605135004417, "kl": 0.33984375, "learning_rate": 1.5487310327483087e-05, "loss": 0.1508, "reward": 1.82421875, "reward_std": 0.5525845661759377, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.91796875, "step": 959 }, { "clip_ratio": 0.0, "completion_length": 704.8828125, "epoch": 0.384, "grad_norm": 0.4904975700724755, "kl": 0.265869140625, "learning_rate": 1.5475632234925505e-05, "loss": 0.0779, "reward": 1.986328125, "reward_std": 0.4164331257343292, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.947265625, "step": 960 }, { "clip_ratio": 0.0, "completion_length": 716.8671875, "epoch": 0.3844, "grad_norm": 0.6956340155820855, "kl": 0.30712890625, "learning_rate": 1.5463943467342694e-05, "loss": 0.0978, "reward": 1.83203125, "reward_std": 0.4301448240876198, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.93359375, "step": 961 }, { "clip_ratio": 0.0, "completion_length": 662.828125, "epoch": 0.3848, "grad_norm": 0.4742497945587886, "kl": 0.23193359375, "learning_rate": 1.5452244047522504e-05, "loss": 0.1276, "reward": 2.17578125, "reward_std": 0.4511345401406288, "rewards/accuracy_reward": 0.3046875, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94921875, "step": 962 }, { "clip_ratio": 0.0, "completion_length": 626.8125, "epoch": 0.3852, "grad_norm": 0.9930744932234539, "kl": 0.2001953125, "learning_rate": 1.544053399827355e-05, "loss": 0.0678, "reward": 1.984375, "reward_std": 0.3626062422990799, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.9765625, "step": 963 }, { "clip_ratio": 0.0, "completion_length": 610.7109375, "epoch": 0.3856, "grad_norm": 0.32770832408944, "kl": 0.233154296875, "learning_rate": 1.5428813342425177e-05, "loss": 0.071, "reward": 2.0078125, "reward_std": 0.30909235030412674, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 964 }, { "clip_ratio": 0.0, "completion_length": 638.4765625, "epoch": 0.386, "grad_norm": 0.3386175404383212, "kl": 0.228515625, "learning_rate": 1.54170821028274e-05, "loss": 0.1022, "reward": 2.0625, "reward_std": 0.3855194002389908, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.9609375, "step": 965 }, { "clip_ratio": 0.0, "completion_length": 647.078125, "epoch": 0.3864, "grad_norm": 0.8323271827484798, "kl": 0.23828125, "learning_rate": 1.540534030235087e-05, "loss": 0.144, "reward": 1.9375, "reward_std": 0.45662496984004974, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.9453125, "step": 966 }, { "clip_ratio": 0.0, "completion_length": 675.9609375, "epoch": 0.3868, "grad_norm": 1.4623070148227217, "kl": 0.301513671875, "learning_rate": 1.5393587963886837e-05, "loss": 0.1034, "reward": 2.00390625, "reward_std": 0.3244430124759674, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94921875, "step": 967 }, { "clip_ratio": 0.0, "completion_length": 679.9765625, "epoch": 0.3872, "grad_norm": 0.3200372032439362, "kl": 0.227294921875, "learning_rate": 1.5381825110347072e-05, "loss": 0.0856, "reward": 1.873046875, "reward_std": 0.3978728652000427, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.943359375, "step": 968 }, { "clip_ratio": 0.0, "completion_length": 630.8671875, "epoch": 0.3876, "grad_norm": 0.9693403898824832, "kl": 0.301025390625, "learning_rate": 1.5370051764663872e-05, "loss": 0.0818, "reward": 2.10546875, "reward_std": 0.4268387034535408, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 969 }, { "clip_ratio": 0.0, "completion_length": 638.75, "epoch": 0.388, "grad_norm": 0.3979731299424252, "kl": 0.23046875, "learning_rate": 1.5358267949789968e-05, "loss": 0.2002, "reward": 2.009765625, "reward_std": 0.4665352776646614, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.931640625, "step": 970 }, { "clip_ratio": 0.0, "completion_length": 652.1328125, "epoch": 0.3884, "grad_norm": 0.9743067162974394, "kl": 0.279541015625, "learning_rate": 1.5346473688698514e-05, "loss": 0.1042, "reward": 1.9921875, "reward_std": 0.3940025046467781, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.9453125, "step": 971 }, { "clip_ratio": 0.0, "completion_length": 661.6171875, "epoch": 0.3888, "grad_norm": 3.467970204035844, "kl": 0.457275390625, "learning_rate": 1.533466900438303e-05, "loss": 0.0603, "reward": 2.134765625, "reward_std": 0.32155244797468185, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 972 }, { "clip_ratio": 0.0, "completion_length": 672.09375, "epoch": 0.3892, "grad_norm": 0.7213559406272447, "kl": 0.2626953125, "learning_rate": 1.532285391985734e-05, "loss": 0.0637, "reward": 2.162109375, "reward_std": 0.2265625, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 973 }, { "clip_ratio": 0.0, "completion_length": 721.34375, "epoch": 0.3896, "grad_norm": 0.8907901580878144, "kl": 0.2236328125, "learning_rate": 1.5311028458155567e-05, "loss": 0.0463, "reward": 2.05859375, "reward_std": 0.3846539333462715, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 974 }, { "clip_ratio": 0.0, "completion_length": 651.78125, "epoch": 0.39, "grad_norm": 0.5615469742469138, "kl": 0.195068359375, "learning_rate": 1.529919264233205e-05, "loss": 0.036, "reward": 1.916015625, "reward_std": 0.22086668014526367, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.978515625, "step": 975 }, { "clip_ratio": 0.0, "completion_length": 694.5390625, "epoch": 0.3904, "grad_norm": 0.37107810962529764, "kl": 0.19287109375, "learning_rate": 1.528734649546132e-05, "loss": 0.0501, "reward": 2.056640625, "reward_std": 0.30420252680778503, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 976 }, { "clip_ratio": 0.0, "completion_length": 604.96875, "epoch": 0.3908, "grad_norm": 1.0358540771088107, "kl": 0.23046875, "learning_rate": 1.5275490040638038e-05, "loss": 0.0242, "reward": 2.166015625, "reward_std": 0.25876642763614655, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 977 }, { "clip_ratio": 0.0, "completion_length": 676.3359375, "epoch": 0.3912, "grad_norm": 0.4994844021065932, "kl": 0.20263671875, "learning_rate": 1.526362330097698e-05, "loss": 0.0383, "reward": 1.9609375, "reward_std": 0.15625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 978 }, { "clip_ratio": 0.0, "completion_length": 585.90625, "epoch": 0.3916, "grad_norm": 0.4573016667430841, "kl": 0.198974609375, "learning_rate": 1.5251746299612959e-05, "loss": 0.0377, "reward": 2.16015625, "reward_std": 0.2196033075451851, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 979 }, { "clip_ratio": 0.0, "completion_length": 645.9453125, "epoch": 0.392, "grad_norm": 0.3279921231650484, "kl": 0.193603515625, "learning_rate": 1.5239859059700794e-05, "loss": 0.0581, "reward": 2.22265625, "reward_std": 0.171875, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 980 }, { "clip_ratio": 0.0, "completion_length": 687.5859375, "epoch": 0.3924, "grad_norm": 2.0288021630020157, "kl": 0.215576171875, "learning_rate": 1.5227961604415266e-05, "loss": 0.028, "reward": 1.982421875, "reward_std": 0.152098648250103, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 981 }, { "clip_ratio": 0.0, "completion_length": 692.421875, "epoch": 0.3928, "grad_norm": 0.5011983264576532, "kl": 0.210693359375, "learning_rate": 1.5216053956951081e-05, "loss": 0.0206, "reward": 2.119140625, "reward_std": 0.26562219113111496, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.986328125, "step": 982 }, { "clip_ratio": 0.0, "completion_length": 649.140625, "epoch": 0.3932, "grad_norm": 0.5188931692763619, "kl": 0.2099609375, "learning_rate": 1.5204136140522799e-05, "loss": 0.0668, "reward": 2.01171875, "reward_std": 0.27289338409900665, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 983 }, { "clip_ratio": 0.0, "completion_length": 631.5625, "epoch": 0.3936, "grad_norm": 0.2644191277716184, "kl": 0.23095703125, "learning_rate": 1.5192208178364815e-05, "loss": 0.0198, "reward": 2.12109375, "reward_std": 0.27249573916196823, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 984 }, { "clip_ratio": 0.0, "completion_length": 603.265625, "epoch": 0.394, "grad_norm": 0.5016039394444706, "kl": 0.234130859375, "learning_rate": 1.5180270093731305e-05, "loss": 0.1109, "reward": 1.986328125, "reward_std": 0.4078434258699417, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.955078125, "step": 985 }, { "clip_ratio": 0.0, "completion_length": 665.2421875, "epoch": 0.3944, "grad_norm": 0.3648913345762868, "kl": 0.24853515625, "learning_rate": 1.5168321909896171e-05, "loss": 0.0774, "reward": 1.94140625, "reward_std": 0.4130426421761513, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.95703125, "step": 986 }, { "clip_ratio": 0.0, "completion_length": 625.328125, "epoch": 0.3948, "grad_norm": 0.3184845513993417, "kl": 0.222412109375, "learning_rate": 1.5156363650153012e-05, "loss": 0.0184, "reward": 2.048828125, "reward_std": 0.16649089753627777, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 987 }, { "clip_ratio": 0.0, "completion_length": 681.609375, "epoch": 0.3952, "grad_norm": 0.2989186035966692, "kl": 0.212158203125, "learning_rate": 1.5144395337815066e-05, "loss": 0.0343, "reward": 2.005859375, "reward_std": 0.2280021756887436, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 988 }, { "clip_ratio": 0.0, "completion_length": 615.5, "epoch": 0.3956, "grad_norm": 0.16074840035397905, "kl": 0.234375, "learning_rate": 1.5132416996215171e-05, "loss": 0.0219, "reward": 2.017578125, "reward_std": 0.11058919876813889, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 989 }, { "clip_ratio": 0.0, "completion_length": 560.875, "epoch": 0.396, "grad_norm": 0.12000772909449006, "kl": 0.214111328125, "learning_rate": 1.5120428648705716e-05, "loss": 0.015, "reward": 1.966796875, "reward_std": 0.0725904181599617, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 990 }, { "clip_ratio": 0.0, "completion_length": 610.375, "epoch": 0.3964, "grad_norm": 0.532913459608869, "kl": 0.2373046875, "learning_rate": 1.51084303186586e-05, "loss": 0.0924, "reward": 1.984375, "reward_std": 0.2785891965031624, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 991 }, { "clip_ratio": 0.0, "completion_length": 564.6640625, "epoch": 0.3968, "grad_norm": 0.27028070954946914, "kl": 0.237060546875, "learning_rate": 1.5096422029465178e-05, "loss": 0.0061, "reward": 2.109375, "reward_std": 0.1665782630443573, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 992 }, { "clip_ratio": 0.0, "completion_length": 781.1875, "epoch": 0.3972, "grad_norm": 0.28229500102033234, "kl": 0.20703125, "learning_rate": 1.508440380453623e-05, "loss": 0.0241, "reward": 1.970703125, "reward_std": 0.14502985030412674, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 993 }, { "clip_ratio": 0.0, "completion_length": 689.859375, "epoch": 0.3976, "grad_norm": 0.22438592764008056, "kl": 0.222900390625, "learning_rate": 1.5072375667301893e-05, "loss": 0.0866, "reward": 1.83984375, "reward_std": 0.3226081356406212, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.91796875, "step": 994 }, { "clip_ratio": 0.0, "completion_length": 687.03125, "epoch": 0.398, "grad_norm": 0.3244356449041546, "kl": 0.19677734375, "learning_rate": 1.5060337641211637e-05, "loss": 0.079, "reward": 2.103515625, "reward_std": 0.3106522932648659, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 995 }, { "clip_ratio": 0.0, "completion_length": 733.8671875, "epoch": 0.3984, "grad_norm": 0.27417154403906085, "kl": 0.208251953125, "learning_rate": 1.504828974973422e-05, "loss": 0.0409, "reward": 2.0625, "reward_std": 0.2712440490722656, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 996 }, { "clip_ratio": 0.0, "completion_length": 745.53125, "epoch": 0.3988, "grad_norm": 0.27772143360142926, "kl": 0.20703125, "learning_rate": 1.503623201635761e-05, "loss": 0.0416, "reward": 2.05859375, "reward_std": 0.23607146739959717, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 997 }, { "clip_ratio": 0.0, "completion_length": 760.0078125, "epoch": 0.3992, "grad_norm": 0.6684965526977893, "kl": 0.22509765625, "learning_rate": 1.5024164464588982e-05, "loss": 0.0169, "reward": 2.20703125, "reward_std": 0.3632860332727432, "rewards/accuracy_reward": 0.2421875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98828125, "step": 998 }, { "clip_ratio": 0.0, "completion_length": 646.0703125, "epoch": 0.3996, "grad_norm": 0.11131701455303883, "kl": 0.20068359375, "learning_rate": 1.5012087117954643e-05, "loss": 0.0079, "reward": 2.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 999 }, { "clip_ratio": 0.0, "completion_length": 683.375, "epoch": 0.4, "grad_norm": 9.339122067808583, "kl": 0.23193359375, "learning_rate": 1.5000000000000002e-05, "loss": 0.0596, "reward": 1.96875, "reward_std": 0.28229377418756485, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.953125, "step": 1000 }, { "clip_ratio": 0.0, "completion_length": 694.2734375, "epoch": 0.4004, "grad_norm": 0.08315617062189738, "kl": 0.18115234375, "learning_rate": 1.498790313428951e-05, "loss": 0.0185, "reward": 1.986328125, "reward_std": 0.0546875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1001 }, { "clip_ratio": 0.0, "completion_length": 609.6328125, "epoch": 0.4008, "grad_norm": 0.2901265468029777, "kl": 0.179931640625, "learning_rate": 1.4975796544406627e-05, "loss": 0.0494, "reward": 2.03125, "reward_std": 0.27122659236192703, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 1002 }, { "clip_ratio": 0.0, "completion_length": 636.234375, "epoch": 0.4012, "grad_norm": 4.087188025407206, "kl": 0.264892578125, "learning_rate": 1.496368025395377e-05, "loss": 0.0156, "reward": 2.087890625, "reward_std": 0.25605712831020355, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 1003 }, { "clip_ratio": 0.0, "completion_length": 648.59375, "epoch": 0.4016, "grad_norm": 0.6338915266992169, "kl": 0.20556640625, "learning_rate": 1.4951554286552266e-05, "loss": 0.0965, "reward": 1.8984375, "reward_std": 0.35085874795913696, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.9453125, "step": 1004 }, { "clip_ratio": 0.0, "completion_length": 663.46875, "epoch": 0.402, "grad_norm": 0.21263230739390784, "kl": 0.182861328125, "learning_rate": 1.493941866584231e-05, "loss": 0.0812, "reward": 1.978515625, "reward_std": 0.3093768358230591, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1005 }, { "clip_ratio": 0.0, "completion_length": 606.0703125, "epoch": 0.4024, "grad_norm": 0.2340480092058966, "kl": 0.2001953125, "learning_rate": 1.4927273415482916e-05, "loss": 0.0304, "reward": 2.017578125, "reward_std": 0.14007875323295593, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1006 }, { "clip_ratio": 0.0, "completion_length": 674.8515625, "epoch": 0.4028, "grad_norm": 0.415122565399135, "kl": 0.2294921875, "learning_rate": 1.4915118559151871e-05, "loss": 0.0348, "reward": 1.970703125, "reward_std": 0.1796875, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 1007 }, { "clip_ratio": 0.0, "completion_length": 719.078125, "epoch": 0.4032, "grad_norm": 0.24145357075496662, "kl": 0.20751953125, "learning_rate": 1.4902954120545687e-05, "loss": 0.0149, "reward": 1.994140625, "reward_std": 0.0859375, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1008 }, { "clip_ratio": 0.0, "completion_length": 658.21875, "epoch": 0.4036, "grad_norm": 0.24429973013592168, "kl": 0.221435546875, "learning_rate": 1.4890780123379565e-05, "loss": 0.0642, "reward": 2.052734375, "reward_std": 0.25775300711393356, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1009 }, { "clip_ratio": 0.0, "completion_length": 634.765625, "epoch": 0.404, "grad_norm": 0.3393222562993017, "kl": 0.220947265625, "learning_rate": 1.4878596591387329e-05, "loss": 0.022, "reward": 2.119140625, "reward_std": 0.17132875323295593, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1010 }, { "clip_ratio": 0.0, "completion_length": 686.640625, "epoch": 0.4044, "grad_norm": 0.2742346526799227, "kl": 0.1962890625, "learning_rate": 1.4866403548321402e-05, "loss": 0.0434, "reward": 2.080078125, "reward_std": 0.26751209050416946, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 1011 }, { "clip_ratio": 0.0, "completion_length": 700.4609375, "epoch": 0.4048, "grad_norm": 0.26401128852545686, "kl": 0.1845703125, "learning_rate": 1.485420101795274e-05, "loss": 0.0249, "reward": 2.072265625, "reward_std": 0.2153049185872078, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1012 }, { "clip_ratio": 0.0, "completion_length": 693.078125, "epoch": 0.4052, "grad_norm": 0.18051353269922413, "kl": 0.189697265625, "learning_rate": 1.4841989024070809e-05, "loss": 0.0059, "reward": 2.2578125, "reward_std": 0.07394562661647797, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1013 }, { "clip_ratio": 0.0, "completion_length": 810.8671875, "epoch": 0.4056, "grad_norm": 0.1833857842808365, "kl": 0.164306640625, "learning_rate": 1.4829767590483508e-05, "loss": 0.0149, "reward": 2.01171875, "reward_std": 0.11187678575515747, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.99609375, "step": 1014 }, { "clip_ratio": 0.0, "completion_length": 736.890625, "epoch": 0.406, "grad_norm": 0.3106335367824375, "kl": 0.175048828125, "learning_rate": 1.4817536741017153e-05, "loss": 0.047, "reward": 2.099609375, "reward_std": 0.4215347021818161, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1015 }, { "clip_ratio": 0.0, "completion_length": 748.984375, "epoch": 0.4064, "grad_norm": 2.797737966164207, "kl": 0.185791015625, "learning_rate": 1.4805296499516408e-05, "loss": 0.0419, "reward": 2.16015625, "reward_std": 0.2424129769206047, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98828125, "step": 1016 }, { "clip_ratio": 0.0, "completion_length": 764.2421875, "epoch": 0.4068, "grad_norm": 0.4011833635552008, "kl": 0.19677734375, "learning_rate": 1.4793046889844252e-05, "loss": 0.0554, "reward": 1.953125, "reward_std": 0.1588020622730255, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 1017 }, { "clip_ratio": 0.0, "completion_length": 738.4765625, "epoch": 0.4072, "grad_norm": 0.7264543254819581, "kl": 0.193603515625, "learning_rate": 1.4780787935881925e-05, "loss": 0.0966, "reward": 1.900390625, "reward_std": 0.35805685818195343, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.955078125, "step": 1018 }, { "clip_ratio": 0.0, "completion_length": 754.9375, "epoch": 0.4076, "grad_norm": 0.2223306094197724, "kl": 0.2060546875, "learning_rate": 1.4768519661528879e-05, "loss": 0.0382, "reward": 2.158203125, "reward_std": 0.21657558530569077, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.986328125, "step": 1019 }, { "clip_ratio": 0.0, "completion_length": 729.9140625, "epoch": 0.408, "grad_norm": 0.3633461923474126, "kl": 0.19775390625, "learning_rate": 1.4756242090702756e-05, "loss": 0.049, "reward": 1.927734375, "reward_std": 0.2291145622730255, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.974609375, "step": 1020 }, { "clip_ratio": 0.0, "completion_length": 727.171875, "epoch": 0.4084, "grad_norm": 0.2824759116054099, "kl": 0.18505859375, "learning_rate": 1.4743955247339292e-05, "loss": 0.0612, "reward": 2.0, "reward_std": 0.3205379769206047, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.9765625, "step": 1021 }, { "clip_ratio": 0.0, "completion_length": 669.6875, "epoch": 0.4088, "grad_norm": 0.1704777303070369, "kl": 0.193359375, "learning_rate": 1.4731659155392332e-05, "loss": 0.063, "reward": 1.978515625, "reward_std": 0.1610720381140709, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 1022 }, { "clip_ratio": 0.0, "completion_length": 665.0546875, "epoch": 0.4092, "grad_norm": 0.31352276246512784, "kl": 0.2109375, "learning_rate": 1.4719353838833729e-05, "loss": 0.0742, "reward": 1.982421875, "reward_std": 0.2916145622730255, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1023 }, { "clip_ratio": 0.0, "completion_length": 682.4765625, "epoch": 0.4096, "grad_norm": 0.2812529610678474, "kl": 0.203369140625, "learning_rate": 1.470703932165333e-05, "loss": 0.0591, "reward": 1.935546875, "reward_std": 0.22315485030412674, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1024 }, { "clip_ratio": 0.0, "completion_length": 652.8359375, "epoch": 0.41, "grad_norm": 0.57438580618544, "kl": 0.203125, "learning_rate": 1.469471562785891e-05, "loss": 0.0534, "reward": 2.12109375, "reward_std": 0.21064698696136475, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1025 }, { "clip_ratio": 0.0, "completion_length": 627.0546875, "epoch": 0.4104, "grad_norm": 0.34125728447247716, "kl": 0.203857421875, "learning_rate": 1.4682382781476146e-05, "loss": 0.1308, "reward": 2.0703125, "reward_std": 0.4416191056370735, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9375, "step": 1026 }, { "clip_ratio": 0.0, "completion_length": 577.484375, "epoch": 0.4108, "grad_norm": 0.40692171624295514, "kl": 0.183837890625, "learning_rate": 1.4670040806548555e-05, "loss": 0.161, "reward": 2.158203125, "reward_std": 0.4927467182278633, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 1027 }, { "clip_ratio": 0.0, "completion_length": 659.3671875, "epoch": 0.4112, "grad_norm": 0.30082219781443625, "kl": 0.204833984375, "learning_rate": 1.4657689727137443e-05, "loss": 0.1209, "reward": 1.90625, "reward_std": 0.48885229229927063, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9375, "step": 1028 }, { "clip_ratio": 0.0, "completion_length": 623.984375, "epoch": 0.4116, "grad_norm": 0.20323190338953134, "kl": 0.171630859375, "learning_rate": 1.464532956732188e-05, "loss": 0.0865, "reward": 2.125, "reward_std": 0.25726838409900665, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1029 }, { "clip_ratio": 0.0, "completion_length": 635.4609375, "epoch": 0.412, "grad_norm": 0.7609574338188737, "kl": 0.19482421875, "learning_rate": 1.463296035119862e-05, "loss": 0.1334, "reward": 2.080078125, "reward_std": 0.4267655536532402, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.947265625, "step": 1030 }, { "clip_ratio": 0.0, "completion_length": 647.0234375, "epoch": 0.4124, "grad_norm": 0.6721951130350298, "kl": 0.190185546875, "learning_rate": 1.4620582102882088e-05, "loss": 0.1034, "reward": 1.986328125, "reward_std": 0.36275503039360046, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 1031 }, { "clip_ratio": 0.0, "completion_length": 660.296875, "epoch": 0.4128, "grad_norm": 0.17643689651473524, "kl": 0.1953125, "learning_rate": 1.4608194846504311e-05, "loss": 0.0627, "reward": 1.958984375, "reward_std": 0.1640625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1032 }, { "clip_ratio": 0.0, "completion_length": 709.078125, "epoch": 0.4132, "grad_norm": 0.2550691574123873, "kl": 0.189208984375, "learning_rate": 1.4595798606214882e-05, "loss": 0.1091, "reward": 2.130859375, "reward_std": 0.44809163361787796, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.951171875, "step": 1033 }, { "clip_ratio": 0.0, "completion_length": 607.4375, "epoch": 0.4136, "grad_norm": 1.092974571530933, "kl": 0.24609375, "learning_rate": 1.4583393406180898e-05, "loss": 0.124, "reward": 1.943359375, "reward_std": 0.35223282873630524, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.927734375, "step": 1034 }, { "clip_ratio": 0.0, "completion_length": 665.328125, "epoch": 0.414, "grad_norm": 0.28519990015682767, "kl": 0.187255859375, "learning_rate": 1.4570979270586944e-05, "loss": 0.0866, "reward": 1.99609375, "reward_std": 0.30014359951019287, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1035 }, { "clip_ratio": 0.0, "completion_length": 719.234375, "epoch": 0.4144, "grad_norm": 0.2725875737271672, "kl": 0.17919921875, "learning_rate": 1.4558556223635004e-05, "loss": 0.0678, "reward": 2.00390625, "reward_std": 0.344283826649189, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.95703125, "step": 1036 }, { "clip_ratio": 0.0, "completion_length": 749.15625, "epoch": 0.4148, "grad_norm": 178368.70840890094, "kl": 20352.14697265625, "learning_rate": 1.454612428954444e-05, "loss": 917.6973, "reward": 1.953125, "reward_std": 0.4134148508310318, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.9375, "step": 1037 }, { "clip_ratio": 0.0, "completion_length": 784.546875, "epoch": 0.4152, "grad_norm": 0.8075422162426623, "kl": 0.207275390625, "learning_rate": 1.4533683492551954e-05, "loss": 0.0745, "reward": 1.767578125, "reward_std": 0.5069761723279953, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.8203125, "rewards/tag_count_reward": 0.892578125, "step": 1038 }, { "clip_ratio": 0.0, "completion_length": 771.7578125, "epoch": 0.4156, "grad_norm": 0.4938984098579439, "kl": 0.18603515625, "learning_rate": 1.4521233856911507e-05, "loss": 0.0556, "reward": 1.958984375, "reward_std": 0.34630946815013885, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 1039 }, { "clip_ratio": 0.0, "completion_length": 738.859375, "epoch": 0.416, "grad_norm": 0.6087839497625988, "kl": 0.201416015625, "learning_rate": 1.4508775406894308e-05, "loss": 0.0943, "reward": 1.763671875, "reward_std": 0.4774354323744774, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.8359375, "rewards/tag_count_reward": 0.896484375, "step": 1040 }, { "clip_ratio": 0.0, "completion_length": 739.453125, "epoch": 0.4164, "grad_norm": 0.4634481647765712, "kl": 0.177001953125, "learning_rate": 1.449630816678874e-05, "loss": 0.1449, "reward": 1.68359375, "reward_std": 0.5516878440976143, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.796875, "rewards/tag_count_reward": 0.85546875, "step": 1041 }, { "clip_ratio": 0.0, "completion_length": 659.1796875, "epoch": 0.4168, "grad_norm": 0.2641447106901835, "kl": 0.155517578125, "learning_rate": 1.4483832160900326e-05, "loss": 0.0214, "reward": 2.04296875, "reward_std": 0.24147292971611023, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98828125, "step": 1042 }, { "clip_ratio": 0.0, "completion_length": 704.578125, "epoch": 0.4172, "grad_norm": 0.6524875278118785, "kl": 0.1982421875, "learning_rate": 1.4471347413551673e-05, "loss": 0.0523, "reward": 2.107421875, "reward_std": 0.2144516110420227, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1043 }, { "clip_ratio": 0.0, "completion_length": 621.2109375, "epoch": 0.4176, "grad_norm": 0.595959392242666, "kl": 0.195068359375, "learning_rate": 1.4458853949082443e-05, "loss": 0.0724, "reward": 2.12109375, "reward_std": 0.18332062661647797, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1044 }, { "clip_ratio": 0.0, "completion_length": 681.171875, "epoch": 0.418, "grad_norm": 0.22794069575828058, "kl": 0.174560546875, "learning_rate": 1.4446351791849276e-05, "loss": 0.0213, "reward": 2.30859375, "reward_std": 0.18195747584104538, "rewards/accuracy_reward": 0.3359375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1045 }, { "clip_ratio": 0.0, "completion_length": 644.7890625, "epoch": 0.4184, "grad_norm": 0.22413786458915286, "kl": 0.152099609375, "learning_rate": 1.4433840966225772e-05, "loss": 0.0435, "reward": 2.123046875, "reward_std": 0.2769899070262909, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1046 }, { "clip_ratio": 0.0, "completion_length": 779.0234375, "epoch": 0.4188, "grad_norm": 0.2895160404074528, "kl": 0.18310546875, "learning_rate": 1.4421321496602428e-05, "loss": 0.0414, "reward": 1.90234375, "reward_std": 0.29965703189373016, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.96484375, "step": 1047 }, { "clip_ratio": 0.0, "completion_length": 733.3046875, "epoch": 0.4192, "grad_norm": 0.223590081040378, "kl": 0.17626953125, "learning_rate": 1.4408793407386587e-05, "loss": 0.0058, "reward": 2.056640625, "reward_std": 0.2949574738740921, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.955078125, "step": 1048 }, { "clip_ratio": 0.0, "completion_length": 620.0, "epoch": 0.4196, "grad_norm": 0.2636208675252569, "kl": 0.183837890625, "learning_rate": 1.43962567230024e-05, "loss": 0.0366, "reward": 2.275390625, "reward_std": 0.23020051419734955, "rewards/accuracy_reward": 0.3046875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.994140625, "step": 1049 }, { "clip_ratio": 0.0, "completion_length": 721.046875, "epoch": 0.42, "grad_norm": 0.16703736146085837, "kl": 0.191650390625, "learning_rate": 1.4383711467890776e-05, "loss": 0.0225, "reward": 2.00390625, "reward_std": 0.10375864803791046, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.99609375, "step": 1050 }, { "clip_ratio": 0.0, "completion_length": 715.875, "epoch": 0.4204, "grad_norm": 0.36509143755881374, "kl": 0.17431640625, "learning_rate": 1.437115766650933e-05, "loss": 0.0192, "reward": 1.982421875, "reward_std": 0.17979396134614944, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1051 }, { "clip_ratio": 0.0, "completion_length": 681.109375, "epoch": 0.4208, "grad_norm": 0.20963770745578833, "kl": 0.199462890625, "learning_rate": 1.4358595343332342e-05, "loss": 0.0059, "reward": 1.935546875, "reward_std": 0.1171875, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.982421875, "step": 1052 }, { "clip_ratio": 0.0, "completion_length": 676.734375, "epoch": 0.4212, "grad_norm": 0.3636252367595005, "kl": 0.20703125, "learning_rate": 1.4346024522850704e-05, "loss": 0.0458, "reward": 2.001953125, "reward_std": 0.2572639063000679, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 1053 }, { "clip_ratio": 0.0, "completion_length": 724.1171875, "epoch": 0.4216, "grad_norm": 0.22147598420538714, "kl": 0.178466796875, "learning_rate": 1.4333445229571874e-05, "loss": 0.0167, "reward": 2.009765625, "reward_std": 0.12863312661647797, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1054 }, { "clip_ratio": 0.0, "completion_length": 683.59375, "epoch": 0.422, "grad_norm": 0.1559362869734904, "kl": 0.193115234375, "learning_rate": 1.4320857488019826e-05, "loss": 0.0668, "reward": 2.052734375, "reward_std": 0.18561824411153793, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 1055 }, { "clip_ratio": 0.0, "completion_length": 758.453125, "epoch": 0.4224, "grad_norm": 0.20117010699492008, "kl": 0.17041015625, "learning_rate": 1.4308261322735006e-05, "loss": 0.0198, "reward": 1.978515625, "reward_std": 0.1882016956806183, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.986328125, "step": 1056 }, { "clip_ratio": 0.0, "completion_length": 768.3828125, "epoch": 0.4228, "grad_norm": 0.27210744014418603, "kl": 0.1845703125, "learning_rate": 1.4295656758274283e-05, "loss": 0.0337, "reward": 1.984375, "reward_std": 0.18787646293640137, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 1057 }, { "clip_ratio": 0.0, "completion_length": 667.9609375, "epoch": 0.4232, "grad_norm": 0.16954383618681038, "kl": 0.190673828125, "learning_rate": 1.4283043819210905e-05, "loss": 0.0171, "reward": 1.990234375, "reward_std": 0.14202880859375, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 1058 }, { "clip_ratio": 0.0, "completion_length": 753.0703125, "epoch": 0.4236, "grad_norm": 0.6628887037668086, "kl": 0.176513671875, "learning_rate": 1.4270422530134433e-05, "loss": 0.0121, "reward": 2.19921875, "reward_std": 0.2179437279701233, "rewards/accuracy_reward": 0.2421875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98828125, "step": 1059 }, { "clip_ratio": 0.0, "completion_length": 716.4921875, "epoch": 0.424, "grad_norm": 0.15828712463199882, "kl": 0.192626953125, "learning_rate": 1.4257792915650728e-05, "loss": 0.0466, "reward": 1.95703125, "reward_std": 0.21320747584104538, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1060 }, { "clip_ratio": 0.0, "completion_length": 711.1640625, "epoch": 0.4244, "grad_norm": 0.16110450986485184, "kl": 0.19287109375, "learning_rate": 1.424515500038186e-05, "loss": 0.0262, "reward": 2.119140625, "reward_std": 0.0859375, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1061 }, { "clip_ratio": 0.0, "completion_length": 754.734375, "epoch": 0.4248, "grad_norm": 0.24998948873721652, "kl": 0.185791015625, "learning_rate": 1.4232508808966097e-05, "loss": 0.0402, "reward": 2.033203125, "reward_std": 0.2613266110420227, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 1062 }, { "clip_ratio": 0.0, "completion_length": 705.6875, "epoch": 0.4252, "grad_norm": 0.15147388009475252, "kl": 0.1845703125, "learning_rate": 1.4219854366057831e-05, "loss": 0.0133, "reward": 2.015625, "reward_std": 0.0625, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1063 }, { "clip_ratio": 0.0, "completion_length": 705.875, "epoch": 0.4256, "grad_norm": 0.1847032672511981, "kl": 0.187744140625, "learning_rate": 1.420719169632755e-05, "loss": 0.0289, "reward": 2.029296875, "reward_std": 0.19156966358423233, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 1064 }, { "clip_ratio": 0.0, "completion_length": 693.765625, "epoch": 0.426, "grad_norm": 0.20303565252098588, "kl": 0.18359375, "learning_rate": 1.4194520824461773e-05, "loss": 0.0356, "reward": 2.060546875, "reward_std": 0.27610156685113907, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1065 }, { "clip_ratio": 0.0, "completion_length": 688.6640625, "epoch": 0.4264, "grad_norm": 0.1011295738763217, "kl": 0.1923828125, "learning_rate": 1.4181841775163014e-05, "loss": 0.0135, "reward": 1.966796875, "reward_std": 0.0725904181599617, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 1066 }, { "clip_ratio": 0.0, "completion_length": 673.4609375, "epoch": 0.4268, "grad_norm": 0.21170842026402817, "kl": 0.166015625, "learning_rate": 1.4169154573149737e-05, "loss": 0.02, "reward": 2.134765625, "reward_std": 0.1484375, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1067 }, { "clip_ratio": 0.0, "completion_length": 701.28125, "epoch": 0.4272, "grad_norm": 0.18898651366825212, "kl": 0.19091796875, "learning_rate": 1.415645924315628e-05, "loss": 0.0341, "reward": 2.095703125, "reward_std": 0.1509895622730255, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 1068 }, { "clip_ratio": 0.0, "completion_length": 646.296875, "epoch": 0.4276, "grad_norm": 0.26183049250559776, "kl": 0.181884765625, "learning_rate": 1.4143755809932843e-05, "loss": 0.0329, "reward": 2.0546875, "reward_std": 0.2008213996887207, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1069 }, { "clip_ratio": 0.0, "completion_length": 673.5078125, "epoch": 0.428, "grad_norm": 0.16676188564067726, "kl": 0.194580078125, "learning_rate": 1.413104429824542e-05, "loss": 0.0297, "reward": 2.005859375, "reward_std": 0.08175812661647797, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 1070 }, { "clip_ratio": 0.0, "completion_length": 700.1328125, "epoch": 0.4284, "grad_norm": 0.228434273548051, "kl": 0.181396484375, "learning_rate": 1.411832473287575e-05, "loss": 0.0087, "reward": 2.09375, "reward_std": 0.14994097501039505, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1071 }, { "clip_ratio": 0.0, "completion_length": 721.6328125, "epoch": 0.4288, "grad_norm": 0.24772605530194386, "kl": 0.166259765625, "learning_rate": 1.4105597138621281e-05, "loss": 0.0085, "reward": 2.1640625, "reward_std": 0.21875, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1072 }, { "clip_ratio": 0.0, "completion_length": 687.9765625, "epoch": 0.4292, "grad_norm": 0.09269145937540813, "kl": 0.17822265625, "learning_rate": 1.4092861540295109e-05, "loss": 0.0436, "reward": 1.97265625, "reward_std": 0.07471735030412674, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1073 }, { "clip_ratio": 0.0, "completion_length": 723.796875, "epoch": 0.4296, "grad_norm": 0.15980920551683278, "kl": 0.180419921875, "learning_rate": 1.4080117962725929e-05, "loss": -0.0003, "reward": 2.1640625, "reward_std": 0.059839196503162384, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1074 }, { "clip_ratio": 0.0, "completion_length": 698.7265625, "epoch": 0.43, "grad_norm": 0.2668799824697938, "kl": 0.179931640625, "learning_rate": 1.4067366430758004e-05, "loss": 0.044, "reward": 2.046875, "reward_std": 0.15779343992471695, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1075 }, { "clip_ratio": 0.0, "completion_length": 713.1171875, "epoch": 0.4304, "grad_norm": 0.20488523710178222, "kl": 0.176513671875, "learning_rate": 1.4054606969251095e-05, "loss": 0.0392, "reward": 2.01953125, "reward_std": 0.20797232538461685, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1076 }, { "clip_ratio": 0.0, "completion_length": 714.34375, "epoch": 0.4308, "grad_norm": 0.13797564807385787, "kl": 0.185302734375, "learning_rate": 1.4041839603080423e-05, "loss": 0.0189, "reward": 2.0234375, "reward_std": 0.07394562661647797, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1077 }, { "clip_ratio": 0.0, "completion_length": 664.4609375, "epoch": 0.4312, "grad_norm": 0.07591230577981906, "kl": 0.17529296875, "learning_rate": 1.4029064357136628e-05, "loss": 0.0348, "reward": 2.111328125, "reward_std": 0.0546875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1078 }, { "clip_ratio": 0.0, "completion_length": 713.09375, "epoch": 0.4316, "grad_norm": 0.16757906649742407, "kl": 0.177978515625, "learning_rate": 1.4016281256325702e-05, "loss": 0.0441, "reward": 1.98046875, "reward_std": 0.140625, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1079 }, { "clip_ratio": 0.0, "completion_length": 743.109375, "epoch": 0.432, "grad_norm": 0.24440226665859022, "kl": 0.212890625, "learning_rate": 1.4003490325568953e-05, "loss": 0.038, "reward": 1.998046875, "reward_std": 0.35222648829221725, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 1080 }, { "clip_ratio": 0.0, "completion_length": 735.609375, "epoch": 0.4324, "grad_norm": 0.2215022258944667, "kl": 0.189697265625, "learning_rate": 1.3990691589802955e-05, "loss": 0.0557, "reward": 1.98046875, "reward_std": 0.25010646134614944, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1081 }, { "clip_ratio": 0.0, "completion_length": 697.15625, "epoch": 0.4328, "grad_norm": 0.16615695257101293, "kl": 0.160888671875, "learning_rate": 1.39778850739795e-05, "loss": 0.0349, "reward": 1.97265625, "reward_std": 0.109375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1082 }, { "clip_ratio": 0.0, "completion_length": 757.2421875, "epoch": 0.4332, "grad_norm": 0.21681224208733363, "kl": 0.1796875, "learning_rate": 1.3965070803065543e-05, "loss": 0.0701, "reward": 2.0, "reward_std": 0.3250408098101616, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1083 }, { "clip_ratio": 0.0, "completion_length": 756.8359375, "epoch": 0.4336, "grad_norm": 0.25955247912709717, "kl": 0.1728515625, "learning_rate": 1.3952248802043166e-05, "loss": 0.0508, "reward": 1.986328125, "reward_std": 0.43488942086696625, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.955078125, "step": 1084 }, { "clip_ratio": 0.0, "completion_length": 661.1640625, "epoch": 0.434, "grad_norm": 0.17809957592682119, "kl": 0.183837890625, "learning_rate": 1.3939419095909513e-05, "loss": 0.0563, "reward": 2.021484375, "reward_std": 0.20413947850465775, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1085 }, { "clip_ratio": 0.0, "completion_length": 756.75, "epoch": 0.4344, "grad_norm": 0.2690542432549811, "kl": 0.169921875, "learning_rate": 1.3926581709676752e-05, "loss": 0.042, "reward": 1.97265625, "reward_std": 0.25590310245752335, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1086 }, { "clip_ratio": 0.0, "completion_length": 650.75, "epoch": 0.4348, "grad_norm": 0.28085192258670455, "kl": 0.139892578125, "learning_rate": 1.3913736668372027e-05, "loss": 0.0802, "reward": 2.123046875, "reward_std": 0.2772243842482567, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1087 }, { "clip_ratio": 0.0, "completion_length": 723.4140625, "epoch": 0.4352, "grad_norm": 0.2127218331748334, "kl": 0.176025390625, "learning_rate": 1.3900883997037398e-05, "loss": 0.0402, "reward": 2.083984375, "reward_std": 0.1640625, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1088 }, { "clip_ratio": 0.0, "completion_length": 756.2890625, "epoch": 0.4356, "grad_norm": 10.045836809461418, "kl": 0.193115234375, "learning_rate": 1.388802372072981e-05, "loss": 0.0402, "reward": 1.9921875, "reward_std": 0.23133426159620285, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 1089 }, { "clip_ratio": 0.0, "completion_length": 703.515625, "epoch": 0.436, "grad_norm": 0.19201280603975746, "kl": 0.171630859375, "learning_rate": 1.3875155864521031e-05, "loss": 0.0355, "reward": 2.029296875, "reward_std": 0.20693126320838928, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.982421875, "step": 1090 }, { "clip_ratio": 0.0, "completion_length": 737.3125, "epoch": 0.4364, "grad_norm": 0.3318661337778714, "kl": 0.185791015625, "learning_rate": 1.3862280453497601e-05, "loss": 0.0549, "reward": 2.169921875, "reward_std": 0.3836240842938423, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1091 }, { "clip_ratio": 0.0, "completion_length": 761.96875, "epoch": 0.4368, "grad_norm": 0.2585477669921002, "kl": 0.181396484375, "learning_rate": 1.3849397512760797e-05, "loss": 0.0373, "reward": 1.958984375, "reward_std": 0.2684025391936302, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1092 }, { "clip_ratio": 0.0, "completion_length": 644.9765625, "epoch": 0.4372, "grad_norm": 0.18285464639159127, "kl": 0.1474609375, "learning_rate": 1.3836507067426565e-05, "loss": 0.0033, "reward": 2.07421875, "reward_std": 0.13894016295671463, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98828125, "step": 1093 }, { "clip_ratio": 0.0, "completion_length": 606.796875, "epoch": 0.4376, "grad_norm": 0.2474238377730477, "kl": 0.156982421875, "learning_rate": 1.3823609142625492e-05, "loss": 0.0562, "reward": 2.0546875, "reward_std": 0.15625, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1094 }, { "clip_ratio": 0.0, "completion_length": 749.140625, "epoch": 0.438, "grad_norm": 0.29813145600121205, "kl": 0.178466796875, "learning_rate": 1.3810703763502744e-05, "loss": 0.0705, "reward": 2.107421875, "reward_std": 0.39494597166776657, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 1095 }, { "clip_ratio": 0.0, "completion_length": 612.90625, "epoch": 0.4384, "grad_norm": 0.2538952357098445, "kl": 0.18798828125, "learning_rate": 1.3797790955218014e-05, "loss": 0.1165, "reward": 2.0546875, "reward_std": 0.2564745992422104, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1096 }, { "clip_ratio": 0.0, "completion_length": 633.109375, "epoch": 0.4388, "grad_norm": 0.11111954277133089, "kl": 0.180908203125, "learning_rate": 1.3784870742945482e-05, "loss": 0.0847, "reward": 2.03515625, "reward_std": 0.14945197850465775, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1097 }, { "clip_ratio": 0.0, "completion_length": 646.25, "epoch": 0.4392, "grad_norm": 0.14224978318984852, "kl": 0.176025390625, "learning_rate": 1.3771943151873768e-05, "loss": 0.0572, "reward": 2.083984375, "reward_std": 0.1640625, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1098 }, { "clip_ratio": 0.0, "completion_length": 697.390625, "epoch": 0.4396, "grad_norm": 0.08272602665192713, "kl": 0.1650390625, "learning_rate": 1.3759008207205869e-05, "loss": 0.0346, "reward": 1.98046875, "reward_std": 0.11945747584104538, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1099 }, { "clip_ratio": 0.0, "completion_length": 634.6875, "epoch": 0.44, "grad_norm": 0.1658453902510875, "kl": 0.155029296875, "learning_rate": 1.3746065934159123e-05, "loss": 0.0069, "reward": 2.076171875, "reward_std": 0.10968904942274094, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 1100 }, { "clip_ratio": 0.0, "completion_length": 678.390625, "epoch": 0.4404, "grad_norm": 0.2857524838307488, "kl": 0.169677734375, "learning_rate": 1.373311635796515e-05, "loss": 0.061, "reward": 2.1796875, "reward_std": 0.355214461684227, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.9765625, "step": 1101 }, { "clip_ratio": 0.0, "completion_length": 611.78125, "epoch": 0.4408, "grad_norm": 0.286624422794285, "kl": 0.16748046875, "learning_rate": 1.3720159503869816e-05, "loss": 0.0308, "reward": 2.134765625, "reward_std": 0.23292839527130127, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1102 }, { "clip_ratio": 0.0, "completion_length": 793.296875, "epoch": 0.4412, "grad_norm": 0.17277100519641816, "kl": 0.158203125, "learning_rate": 1.3707195397133165e-05, "loss": 0.0216, "reward": 1.982421875, "reward_std": 0.15075266361236572, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 1103 }, { "clip_ratio": 0.0, "completion_length": 693.5234375, "epoch": 0.4416, "grad_norm": 0.5890189622957084, "kl": 0.17724609375, "learning_rate": 1.3694224063029396e-05, "loss": 0.0414, "reward": 1.96875, "reward_std": 0.20149768888950348, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 1104 }, { "clip_ratio": 0.0, "completion_length": 640.46875, "epoch": 0.442, "grad_norm": 0.30840536767633203, "kl": 0.159423828125, "learning_rate": 1.3681245526846782e-05, "loss": 0.1023, "reward": 2.068359375, "reward_std": 0.37843698263168335, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 1105 }, { "clip_ratio": 0.0, "completion_length": 716.1015625, "epoch": 0.4424, "grad_norm": 0.26406277919899035, "kl": 0.164794921875, "learning_rate": 1.3668259813887644e-05, "loss": 0.0238, "reward": 2.130859375, "reward_std": 0.30848661065101624, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 1106 }, { "clip_ratio": 0.0, "completion_length": 721.515625, "epoch": 0.4428, "grad_norm": 0.10124429883374532, "kl": 0.17138671875, "learning_rate": 1.365526694946829e-05, "loss": 0.0116, "reward": 1.91796875, "reward_std": 0.08605579286813736, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.98046875, "step": 1107 }, { "clip_ratio": 0.0, "completion_length": 709.8671875, "epoch": 0.4432, "grad_norm": 0.24149816017011608, "kl": 0.1494140625, "learning_rate": 1.3642266958918985e-05, "loss": 0.053, "reward": 2.095703125, "reward_std": 0.1796875, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 1108 }, { "clip_ratio": 0.0, "completion_length": 730.2109375, "epoch": 0.4436, "grad_norm": 0.24884237190027025, "kl": 0.156005859375, "learning_rate": 1.3629259867583864e-05, "loss": 0.0864, "reward": 1.9453125, "reward_std": 0.3366374969482422, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1109 }, { "clip_ratio": 0.0, "completion_length": 679.953125, "epoch": 0.444, "grad_norm": 0.2895052134406492, "kl": 0.153076171875, "learning_rate": 1.3616245700820922e-05, "loss": 0.0133, "reward": 2.240234375, "reward_std": 0.24458424746990204, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 1110 }, { "clip_ratio": 0.0, "completion_length": 597.8515625, "epoch": 0.4444, "grad_norm": 0.17405182247309833, "kl": 0.171630859375, "learning_rate": 1.3603224484001949e-05, "loss": 0.0963, "reward": 2.220703125, "reward_std": 0.2793978899717331, "rewards/accuracy_reward": 0.2890625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1111 }, { "clip_ratio": 0.0, "completion_length": 656.6015625, "epoch": 0.4448, "grad_norm": 0.1300440102265719, "kl": 0.16455078125, "learning_rate": 1.3590196242512463e-05, "loss": 0.0338, "reward": 1.9765625, "reward_std": 0.09375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1112 }, { "clip_ratio": 0.0, "completion_length": 689.3515625, "epoch": 0.4452, "grad_norm": 0.22131042892176858, "kl": 0.154296875, "learning_rate": 1.3577161001751696e-05, "loss": 0.0304, "reward": 1.9140625, "reward_std": 0.25276806205511093, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.9609375, "step": 1113 }, { "clip_ratio": 0.0, "completion_length": 698.65625, "epoch": 0.4456, "grad_norm": 0.3484430386345801, "kl": 0.19189453125, "learning_rate": 1.3564118787132507e-05, "loss": 0.0725, "reward": 2.013671875, "reward_std": 0.31918085366487503, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 1114 }, { "clip_ratio": 0.0, "completion_length": 686.921875, "epoch": 0.446, "grad_norm": 0.2019691540061367, "kl": 0.158203125, "learning_rate": 1.3551069624081372e-05, "loss": 0.0535, "reward": 1.96875, "reward_std": 0.23019562661647797, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 1115 }, { "clip_ratio": 0.0, "completion_length": 613.7109375, "epoch": 0.4464, "grad_norm": 0.2817370603318587, "kl": 0.171142578125, "learning_rate": 1.3538013538038295e-05, "loss": -0.0014, "reward": 2.3046875, "reward_std": 0.18483919650316238, "rewards/accuracy_reward": 0.3046875, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1116 }, { "clip_ratio": 0.0, "completion_length": 701.2578125, "epoch": 0.4468, "grad_norm": 0.21749592847786764, "kl": 0.1826171875, "learning_rate": 1.3524950554456786e-05, "loss": 0.0322, "reward": 2.0625, "reward_std": 0.20048906654119492, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1117 }, { "clip_ratio": 0.0, "completion_length": 626.4921875, "epoch": 0.4472, "grad_norm": 0.13357811652970372, "kl": 0.156494140625, "learning_rate": 1.3511880698803801e-05, "loss": 0.0455, "reward": 2.111328125, "reward_std": 0.0546875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1118 }, { "clip_ratio": 0.0, "completion_length": 663.0, "epoch": 0.4476, "grad_norm": 0.1730964640566957, "kl": 0.15478515625, "learning_rate": 1.349880399655969e-05, "loss": 0.0205, "reward": 2.1015625, "reward_std": 0.09375, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1119 }, { "clip_ratio": 0.0, "completion_length": 682.9453125, "epoch": 0.448, "grad_norm": 0.24603325031865625, "kl": 0.1630859375, "learning_rate": 1.3485720473218153e-05, "loss": 0.0282, "reward": 2.12109375, "reward_std": 0.12510646134614944, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1120 }, { "clip_ratio": 0.0, "completion_length": 728.0703125, "epoch": 0.4484, "grad_norm": 0.16510178619752652, "kl": 0.16015625, "learning_rate": 1.347263015428619e-05, "loss": 0.0197, "reward": 2.107421875, "reward_std": 0.17979396134614944, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1121 }, { "clip_ratio": 0.0, "completion_length": 671.625, "epoch": 0.4488, "grad_norm": 0.2238965298281718, "kl": 0.15625, "learning_rate": 1.3459533065284049e-05, "loss": 0.0508, "reward": 2.03515625, "reward_std": 0.2085021734237671, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1122 }, { "clip_ratio": 0.0, "completion_length": 738.3828125, "epoch": 0.4492, "grad_norm": 0.22272312512571574, "kl": 0.17529296875, "learning_rate": 1.344642923174517e-05, "loss": 0.0755, "reward": 1.986328125, "reward_std": 0.358421728014946, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.955078125, "step": 1123 }, { "clip_ratio": 0.0, "completion_length": 676.7890625, "epoch": 0.4496, "grad_norm": 0.1846391020600253, "kl": 0.154541015625, "learning_rate": 1.3433318679216154e-05, "loss": 0.0155, "reward": 2.0, "reward_std": 0.21155157685279846, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.9765625, "step": 1124 }, { "clip_ratio": 0.0, "completion_length": 745.375, "epoch": 0.45, "grad_norm": 0.2022146934933656, "kl": 0.15234375, "learning_rate": 1.342020143325669e-05, "loss": 0.0084, "reward": 2.109375, "reward_std": 0.1752614639699459, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.9921875, "step": 1125 }, { "clip_ratio": 0.0, "completion_length": 667.234375, "epoch": 0.4504, "grad_norm": 0.22022411686865412, "kl": 0.16796875, "learning_rate": 1.340707751943952e-05, "loss": 0.0113, "reward": 2.203125, "reward_std": 0.15292393416166306, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1126 }, { "clip_ratio": 0.0, "completion_length": 711.484375, "epoch": 0.4508, "grad_norm": 0.2748986711105647, "kl": 0.175537109375, "learning_rate": 1.3393946963350381e-05, "loss": 0.0573, "reward": 1.98828125, "reward_std": 0.22875864803791046, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98828125, "step": 1127 }, { "clip_ratio": 0.0, "completion_length": 765.171875, "epoch": 0.4512, "grad_norm": 0.17538452230405063, "kl": 0.15380859375, "learning_rate": 1.3380809790587975e-05, "loss": 0.0256, "reward": 1.96875, "reward_std": 0.14456743001937866, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 1128 }, { "clip_ratio": 0.0, "completion_length": 665.03125, "epoch": 0.4516, "grad_norm": 0.2849808849667467, "kl": 0.17626953125, "learning_rate": 1.3367666026763884e-05, "loss": 0.1023, "reward": 2.05078125, "reward_std": 0.3142121434211731, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1129 }, { "clip_ratio": 0.0, "completion_length": 704.328125, "epoch": 0.452, "grad_norm": 0.26445222262920076, "kl": 0.167724609375, "learning_rate": 1.3354515697502552e-05, "loss": 0.0548, "reward": 1.970703125, "reward_std": 0.28815338015556335, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1130 }, { "clip_ratio": 0.0, "completion_length": 731.4921875, "epoch": 0.4524, "grad_norm": 0.18892167200002116, "kl": 0.160888671875, "learning_rate": 1.3341358828441217e-05, "loss": 0.0359, "reward": 2.248046875, "reward_std": 0.26523417234420776, "rewards/accuracy_reward": 0.2890625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1131 }, { "clip_ratio": 0.0, "completion_length": 685.8515625, "epoch": 0.4528, "grad_norm": 0.2523787636661126, "kl": 0.153076171875, "learning_rate": 1.3328195445229869e-05, "loss": 0.0175, "reward": 2.095703125, "reward_std": 0.1363266110420227, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1132 }, { "clip_ratio": 0.0, "completion_length": 673.1640625, "epoch": 0.4532, "grad_norm": 0.1739188348772286, "kl": 0.160888671875, "learning_rate": 1.3315025573531198e-05, "loss": 0.0348, "reward": 2.1015625, "reward_std": 0.09375, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1133 }, { "clip_ratio": 0.0, "completion_length": 757.359375, "epoch": 0.4536, "grad_norm": 0.17628628770369936, "kl": 0.14892578125, "learning_rate": 1.3301849239020537e-05, "loss": 0.0142, "reward": 1.99609375, "reward_std": 0.09606516361236572, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.99609375, "step": 1134 }, { "clip_ratio": 0.0, "completion_length": 621.1015625, "epoch": 0.454, "grad_norm": 0.16912073986017087, "kl": 0.1619873046875, "learning_rate": 1.3288666467385834e-05, "loss": 0.01, "reward": 2.1640625, "reward_std": 0.08715169876813889, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1135 }, { "clip_ratio": 0.0, "completion_length": 786.609375, "epoch": 0.4544, "grad_norm": 0.2668033743522439, "kl": 0.153564453125, "learning_rate": 1.327547728432757e-05, "loss": 0.0328, "reward": 2.15234375, "reward_std": 0.3090071976184845, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1136 }, { "clip_ratio": 0.0, "completion_length": 679.296875, "epoch": 0.4548, "grad_norm": 0.18031440072635804, "kl": 0.166015625, "learning_rate": 1.3262281715558736e-05, "loss": -0.0008, "reward": 2.1328125, "reward_std": 0.03125, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1137 }, { "clip_ratio": 0.0, "completion_length": 708.140625, "epoch": 0.4552, "grad_norm": 0.15915048622514397, "kl": 0.1630859375, "learning_rate": 1.3249079786804765e-05, "loss": 0.0322, "reward": 1.99609375, "reward_std": 0.1597641110420227, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1138 }, { "clip_ratio": 0.0, "completion_length": 704.4140625, "epoch": 0.4556, "grad_norm": 0.23617230447981222, "kl": 0.1533203125, "learning_rate": 1.3235871523803496e-05, "loss": 0.0402, "reward": 1.990234375, "reward_std": 0.21104396134614944, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1139 }, { "clip_ratio": 0.0, "completion_length": 743.6171875, "epoch": 0.456, "grad_norm": 0.3349003175563944, "kl": 0.1630859375, "learning_rate": 1.3222656952305113e-05, "loss": 0.0624, "reward": 2.107421875, "reward_std": 0.4114105850458145, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1140 }, { "clip_ratio": 0.0, "completion_length": 722.15625, "epoch": 0.4564, "grad_norm": 0.19976090285080716, "kl": 0.153076171875, "learning_rate": 1.3209436098072095e-05, "loss": 0.0075, "reward": 2.2421875, "reward_std": 0.14965169876813889, "rewards/accuracy_reward": 0.2421875, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1141 }, { "clip_ratio": 0.0, "completion_length": 746.8125, "epoch": 0.4568, "grad_norm": 0.36801250433752564, "kl": 0.171875, "learning_rate": 1.319620898687918e-05, "loss": 0.0567, "reward": 1.986328125, "reward_std": 0.31157267838716507, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 1142 }, { "clip_ratio": 0.0, "completion_length": 647.90625, "epoch": 0.4572, "grad_norm": 0.2124553477167372, "kl": 0.16552734375, "learning_rate": 1.3182975644513296e-05, "loss": 0.0173, "reward": 1.994140625, "reward_std": 0.0859375, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1143 }, { "clip_ratio": 0.0, "completion_length": 740.5, "epoch": 0.4576, "grad_norm": 0.21024222462799988, "kl": 0.137451171875, "learning_rate": 1.316973609677352e-05, "loss": 0.0403, "reward": 2.234375, "reward_std": 0.2685217931866646, "rewards/accuracy_reward": 0.3046875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.9765625, "step": 1144 }, { "clip_ratio": 0.0, "completion_length": 633.5859375, "epoch": 0.458, "grad_norm": 0.44007068655921466, "kl": 0.1748046875, "learning_rate": 1.3156490369471026e-05, "loss": 0.0478, "reward": 2.21484375, "reward_std": 0.3144483342766762, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1145 }, { "clip_ratio": 0.0, "completion_length": 742.28125, "epoch": 0.4584, "grad_norm": 0.27276234513775877, "kl": 0.169921875, "learning_rate": 1.3143238488429042e-05, "loss": 0.0822, "reward": 2.208984375, "reward_std": 0.40635228157043457, "rewards/accuracy_reward": 0.3046875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 1146 }, { "clip_ratio": 0.0, "completion_length": 684.609375, "epoch": 0.4588, "grad_norm": 0.20890154686376963, "kl": 0.17333984375, "learning_rate": 1.3129980479482783e-05, "loss": 0.0629, "reward": 2.083984375, "reward_std": 0.2490832358598709, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1147 }, { "clip_ratio": 0.0, "completion_length": 721.140625, "epoch": 0.4592, "grad_norm": 0.23310343879178658, "kl": 0.15380859375, "learning_rate": 1.3116716368479418e-05, "loss": 0.0352, "reward": 2.138671875, "reward_std": 0.2517440393567085, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1148 }, { "clip_ratio": 0.0, "completion_length": 703.5546875, "epoch": 0.4596, "grad_norm": 0.30866900291887733, "kl": 0.146484375, "learning_rate": 1.3103446181278015e-05, "loss": 0.0773, "reward": 1.912109375, "reward_std": 0.29136117547750473, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.958984375, "step": 1149 }, { "clip_ratio": 0.0, "completion_length": 694.5234375, "epoch": 0.46, "grad_norm": 0.35998968504225054, "kl": 0.164794921875, "learning_rate": 1.3090169943749475e-05, "loss": 0.0225, "reward": 2.02734375, "reward_std": 0.21190982311964035, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1150 }, { "clip_ratio": 0.0, "completion_length": 715.25, "epoch": 0.4604, "grad_norm": 1.3941778378115282, "kl": 0.1640625, "learning_rate": 1.3076887681776509e-05, "loss": 0.0874, "reward": 2.037109375, "reward_std": 0.3663008362054825, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.958984375, "step": 1151 }, { "clip_ratio": 0.0, "completion_length": 703.3046875, "epoch": 0.4608, "grad_norm": 0.2823121427868464, "kl": 0.1650390625, "learning_rate": 1.306359942125356e-05, "loss": 0.1307, "reward": 1.8671875, "reward_std": 0.5328136160969734, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.828125, "rewards/tag_count_reward": 0.890625, "step": 1152 }, { "clip_ratio": 0.0, "completion_length": 779.8828125, "epoch": 0.4612, "grad_norm": 0.3801240116484708, "kl": 0.16552734375, "learning_rate": 1.3050305188086778e-05, "loss": 0.1022, "reward": 1.98046875, "reward_std": 0.5505883172154427, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.8359375, "rewards/tag_count_reward": 0.89453125, "step": 1153 }, { "clip_ratio": 0.0, "completion_length": 848.0234375, "epoch": 0.4616, "grad_norm": 0.28560258780737735, "kl": 0.1484375, "learning_rate": 1.3037005008193944e-05, "loss": 0.0701, "reward": 1.78515625, "reward_std": 0.4895561933517456, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.92578125, "step": 1154 }, { "clip_ratio": 0.0, "completion_length": 874.046875, "epoch": 0.462, "grad_norm": 0.4457876103782754, "kl": 0.18896484375, "learning_rate": 1.3023698907504447e-05, "loss": 0.1075, "reward": 1.580078125, "reward_std": 0.6936578154563904, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.6640625, "rewards/tag_count_reward": 0.791015625, "step": 1155 }, { "clip_ratio": 0.0, "completion_length": 793.296875, "epoch": 0.4624, "grad_norm": 0.391742360115515, "kl": 0.17919921875, "learning_rate": 1.3010386911959207e-05, "loss": 0.0848, "reward": 1.810546875, "reward_std": 0.4504059851169586, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.8046875, "rewards/tag_count_reward": 0.880859375, "step": 1156 }, { "clip_ratio": 0.0, "completion_length": 740.46875, "epoch": 0.4628, "grad_norm": 0.4048740355904904, "kl": 0.1767578125, "learning_rate": 1.299706904751064e-05, "loss": 0.0849, "reward": 1.84375, "reward_std": 0.5248775109648705, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.9140625, "step": 1157 }, { "clip_ratio": 0.0, "completion_length": 793.890625, "epoch": 0.4632, "grad_norm": 0.4341376707014951, "kl": 0.1640625, "learning_rate": 1.2983745340122604e-05, "loss": 0.1195, "reward": 1.80859375, "reward_std": 0.6068409383296967, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.91015625, "step": 1158 }, { "clip_ratio": 0.0, "completion_length": 832.875, "epoch": 0.4636, "grad_norm": 0.27132418274999737, "kl": 0.137451171875, "learning_rate": 1.297041581577035e-05, "loss": 0.0762, "reward": 1.740234375, "reward_std": 0.5441596582531929, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.8203125, "rewards/tag_count_reward": 0.912109375, "step": 1159 }, { "clip_ratio": 0.0, "completion_length": 734.0859375, "epoch": 0.464, "grad_norm": 0.8415294895794072, "kl": 0.19775390625, "learning_rate": 1.2957080500440469e-05, "loss": 0.0961, "reward": 1.96484375, "reward_std": 0.4046466276049614, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94921875, "step": 1160 }, { "clip_ratio": 0.0, "completion_length": 713.890625, "epoch": 0.4644, "grad_norm": 0.3259752105393027, "kl": 0.1875, "learning_rate": 1.2943739420130837e-05, "loss": 0.0806, "reward": 1.962890625, "reward_std": 0.35018254816532135, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.955078125, "step": 1161 }, { "clip_ratio": 0.0, "completion_length": 715.6015625, "epoch": 0.4648, "grad_norm": 0.7655414984365829, "kl": 0.2177734375, "learning_rate": 1.2930392600850574e-05, "loss": 0.2116, "reward": 1.732421875, "reward_std": 0.6656563878059387, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.7421875, "rewards/tag_count_reward": 0.818359375, "step": 1162 }, { "clip_ratio": 0.0, "completion_length": 658.3984375, "epoch": 0.4652, "grad_norm": 0.2726798447951227, "kl": 0.16650390625, "learning_rate": 1.291704006861999e-05, "loss": 0.0999, "reward": 2.2421875, "reward_std": 0.39112184196710587, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.9609375, "step": 1163 }, { "clip_ratio": 0.0, "completion_length": 605.8203125, "epoch": 0.4656, "grad_norm": 0.5028252365076595, "kl": 0.184326171875, "learning_rate": 1.2903681849470528e-05, "loss": 0.1388, "reward": 1.9765625, "reward_std": 0.3369347006082535, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1164 }, { "clip_ratio": 0.0, "completion_length": 589.1640625, "epoch": 0.466, "grad_norm": 0.4041802523364028, "kl": 0.175537109375, "learning_rate": 1.2890317969444716e-05, "loss": 0.0854, "reward": 2.06640625, "reward_std": 0.3253549635410309, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1165 }, { "clip_ratio": 0.0, "completion_length": 595.5, "epoch": 0.4664, "grad_norm": 0.37749322512513556, "kl": 0.16064453125, "learning_rate": 1.287694845459613e-05, "loss": 0.1001, "reward": 1.935546875, "reward_std": 0.22315484285354614, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1166 }, { "clip_ratio": 0.0, "completion_length": 523.5625, "epoch": 0.4668, "grad_norm": 2.294737025029553, "kl": 0.20361328125, "learning_rate": 1.2863573330989315e-05, "loss": 0.0605, "reward": 2.09375, "reward_std": 0.24763091653585434, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1167 }, { "clip_ratio": 0.0, "completion_length": 647.109375, "epoch": 0.4672, "grad_norm": 0.72533315962966, "kl": 0.20166015625, "learning_rate": 1.2850192624699762e-05, "loss": 0.0439, "reward": 1.822265625, "reward_std": 0.08818094432353973, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.923828125, "step": 1168 }, { "clip_ratio": 0.0, "completion_length": 615.875, "epoch": 0.4676, "grad_norm": 0.41171433573606075, "kl": 0.17919921875, "learning_rate": 1.2836806361813846e-05, "loss": 0.053, "reward": 2.107421875, "reward_std": 0.1811385676264763, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1169 }, { "clip_ratio": 0.0, "completion_length": 621.8046875, "epoch": 0.468, "grad_norm": 0.4224466053474729, "kl": 0.18310546875, "learning_rate": 1.2823414568428767e-05, "loss": 0.1051, "reward": 1.935546875, "reward_std": 0.2578125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1170 }, { "clip_ratio": 0.0, "completion_length": 578.1171875, "epoch": 0.4684, "grad_norm": 0.3188293897459915, "kl": 0.166015625, "learning_rate": 1.2810017270652513e-05, "loss": 0.125, "reward": 2.16015625, "reward_std": 0.3553115501999855, "rewards/accuracy_reward": 0.2421875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1171 }, { "clip_ratio": 0.0, "completion_length": 605.2265625, "epoch": 0.4688, "grad_norm": 0.45094289475819194, "kl": 0.17626953125, "learning_rate": 1.27966144946038e-05, "loss": 0.1371, "reward": 2.029296875, "reward_std": 0.3009554073214531, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1172 }, { "clip_ratio": 0.0, "completion_length": 595.578125, "epoch": 0.4692, "grad_norm": 0.3499297715752782, "kl": 0.181640625, "learning_rate": 1.278320626641203e-05, "loss": 0.0813, "reward": 2.033203125, "reward_std": 0.2462654784321785, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 1173 }, { "clip_ratio": 0.0, "completion_length": 631.0546875, "epoch": 0.4696, "grad_norm": 1.316722311474284, "kl": 0.257568359375, "learning_rate": 1.2769792612217224e-05, "loss": 0.1459, "reward": 1.931640625, "reward_std": 0.426826536655426, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.947265625, "step": 1174 }, { "clip_ratio": 0.0, "completion_length": 619.890625, "epoch": 0.47, "grad_norm": 0.2365436428294542, "kl": 0.171875, "learning_rate": 1.2756373558169992e-05, "loss": 0.1007, "reward": 1.978515625, "reward_std": 0.15759294480085373, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 1175 }, { "clip_ratio": 0.0, "completion_length": 619.5390625, "epoch": 0.4704, "grad_norm": 0.2883984370922046, "kl": 0.16748046875, "learning_rate": 1.2742949130431468e-05, "loss": 0.056, "reward": 2.02734375, "reward_std": 0.2156658098101616, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1176 }, { "clip_ratio": 0.0, "completion_length": 619.9375, "epoch": 0.4708, "grad_norm": 0.242370250452701, "kl": 0.1552734375, "learning_rate": 1.2729519355173254e-05, "loss": 0.0298, "reward": 2.025390625, "reward_std": 0.1675766110420227, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1177 }, { "clip_ratio": 0.0, "completion_length": 604.875, "epoch": 0.4712, "grad_norm": 0.3007849835645844, "kl": 0.162841796875, "learning_rate": 1.2716084258577388e-05, "loss": 0.1277, "reward": 2.125, "reward_std": 0.2641315385699272, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1178 }, { "clip_ratio": 0.0, "completion_length": 653.2421875, "epoch": 0.4716, "grad_norm": 0.26706959714116646, "kl": 0.164306640625, "learning_rate": 1.270264386683628e-05, "loss": 0.0711, "reward": 2.0859375, "reward_std": 0.26144562661647797, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1179 }, { "clip_ratio": 0.0, "completion_length": 659.6875, "epoch": 0.472, "grad_norm": 0.23591625069481925, "kl": 0.15771484375, "learning_rate": 1.2689198206152657e-05, "loss": 0.0079, "reward": 1.978515625, "reward_std": 0.1931464672088623, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.978515625, "step": 1180 }, { "clip_ratio": 0.0, "completion_length": 715.4453125, "epoch": 0.4724, "grad_norm": 0.14350933625631776, "kl": 0.15771484375, "learning_rate": 1.2675747302739528e-05, "loss": 0.007, "reward": 2.0234375, "reward_std": 0.050389111042022705, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1181 }, { "clip_ratio": 0.0, "completion_length": 774.9453125, "epoch": 0.4728, "grad_norm": 0.26874461720032955, "kl": 0.1572265625, "learning_rate": 1.2662291182820115e-05, "loss": 0.0602, "reward": 1.908203125, "reward_std": 0.3343699425458908, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.962890625, "step": 1182 }, { "clip_ratio": 0.0, "completion_length": 724.9921875, "epoch": 0.4732, "grad_norm": 0.3116860037119187, "kl": 0.149169921875, "learning_rate": 1.2648829872627809e-05, "loss": 0.0561, "reward": 2.080078125, "reward_std": 0.3419615998864174, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.962890625, "step": 1183 }, { "clip_ratio": 0.0, "completion_length": 684.6484375, "epoch": 0.4736, "grad_norm": 0.20671958931135598, "kl": 0.14013671875, "learning_rate": 1.263536339840613e-05, "loss": 0.0229, "reward": 1.9375, "reward_std": 0.2213020622730255, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.9765625, "step": 1184 }, { "clip_ratio": 0.0, "completion_length": 750.859375, "epoch": 0.474, "grad_norm": 0.25559209018534806, "kl": 0.147705078125, "learning_rate": 1.2621891786408648e-05, "loss": 0.0716, "reward": 2.17578125, "reward_std": 0.42560143768787384, "rewards/accuracy_reward": 0.2734375, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.96484375, "step": 1185 }, { "clip_ratio": 0.0, "completion_length": 854.234375, "epoch": 0.4744, "grad_norm": 0.26173852495548455, "kl": 0.137939453125, "learning_rate": 1.2608415062898971e-05, "loss": 0.0404, "reward": 1.8203125, "reward_std": 0.3578655272722244, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.9296875, "step": 1186 }, { "clip_ratio": 0.0, "completion_length": 754.9609375, "epoch": 0.4748, "grad_norm": 0.2827241932050014, "kl": 0.16064453125, "learning_rate": 1.2594933254150654e-05, "loss": 0.0494, "reward": 2.0859375, "reward_std": 0.3435870110988617, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1187 }, { "clip_ratio": 0.0, "completion_length": 759.1640625, "epoch": 0.4752, "grad_norm": 0.24007736077521763, "kl": 0.1312255859375, "learning_rate": 1.2581446386447178e-05, "loss": 0.0216, "reward": 2.09765625, "reward_std": 0.2980691269040108, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1188 }, { "clip_ratio": 0.0, "completion_length": 640.140625, "epoch": 0.4756, "grad_norm": 0.32063044250368855, "kl": 0.146240234375, "learning_rate": 1.256795448608188e-05, "loss": 0.0708, "reward": 2.173828125, "reward_std": 0.2783287465572357, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.970703125, "step": 1189 }, { "clip_ratio": 0.0, "completion_length": 664.4765625, "epoch": 0.476, "grad_norm": 0.23444076871621347, "kl": 0.150390625, "learning_rate": 1.2554457579357906e-05, "loss": 0.0617, "reward": 2.0234375, "reward_std": 0.2793857827782631, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1190 }, { "clip_ratio": 0.0, "completion_length": 684.59375, "epoch": 0.4764, "grad_norm": 0.2878466376492846, "kl": 0.1474609375, "learning_rate": 1.2540955692588173e-05, "loss": 0.0532, "reward": 2.203125, "reward_std": 0.35236895084381104, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1191 }, { "clip_ratio": 0.0, "completion_length": 636.0390625, "epoch": 0.4768, "grad_norm": 0.8411173232382821, "kl": 0.18017578125, "learning_rate": 1.2527448852095295e-05, "loss": 0.0732, "reward": 1.978515625, "reward_std": 0.30271951854228973, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1192 }, { "clip_ratio": 0.0, "completion_length": 713.6796875, "epoch": 0.4772, "grad_norm": 0.18123004805057039, "kl": 0.153564453125, "learning_rate": 1.251393708421155e-05, "loss": 0.0502, "reward": 2.2109375, "reward_std": 0.263201579451561, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1193 }, { "clip_ratio": 0.0, "completion_length": 588.0546875, "epoch": 0.4776, "grad_norm": 0.37813106686373876, "kl": 0.173828125, "learning_rate": 1.2500420415278822e-05, "loss": 0.1164, "reward": 2.365234375, "reward_std": 0.5092446058988571, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1194 }, { "clip_ratio": 0.0, "completion_length": 604.4140625, "epoch": 0.478, "grad_norm": 0.2531561095753806, "kl": 0.1728515625, "learning_rate": 1.2486898871648552e-05, "loss": 0.034, "reward": 2.134765625, "reward_std": 0.20367393642663956, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1195 }, { "clip_ratio": 0.0, "completion_length": 612.96875, "epoch": 0.4784, "grad_norm": 0.32450863462467916, "kl": 0.15234375, "learning_rate": 1.2473372479681671e-05, "loss": 0.0819, "reward": 2.19140625, "reward_std": 0.3112664967775345, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1196 }, { "clip_ratio": 0.0, "completion_length": 704.796875, "epoch": 0.4788, "grad_norm": 0.25214552021895686, "kl": 0.171875, "learning_rate": 1.2459841265748582e-05, "loss": 0.074, "reward": 2.103515625, "reward_std": 0.3942245543003082, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.962890625, "step": 1197 }, { "clip_ratio": 0.0, "completion_length": 687.7890625, "epoch": 0.4792, "grad_norm": 1.3795917316959116, "kl": 0.20361328125, "learning_rate": 1.2446305256229074e-05, "loss": 0.1222, "reward": 1.970703125, "reward_std": 0.39133811742067337, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.947265625, "step": 1198 }, { "clip_ratio": 0.0, "completion_length": 669.3828125, "epoch": 0.4796, "grad_norm": 0.35508074895647834, "kl": 0.17578125, "learning_rate": 1.2432764477512294e-05, "loss": 0.0505, "reward": 2.166015625, "reward_std": 0.2923780605196953, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1199 }, { "clip_ratio": 0.0, "completion_length": 732.578125, "epoch": 0.48, "grad_norm": 0.20405316675174207, "kl": 0.146728515625, "learning_rate": 1.2419218955996677e-05, "loss": 0.0205, "reward": 2.169921875, "reward_std": 0.1666145622730255, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 1200 }, { "clip_ratio": 0.0, "completion_length": 746.9375, "epoch": 0.4804, "grad_norm": 0.2854313636292743, "kl": 0.187744140625, "learning_rate": 1.2405668718089918e-05, "loss": 0.0296, "reward": 2.044921875, "reward_std": 0.23330553621053696, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.974609375, "step": 1201 }, { "clip_ratio": 0.0, "completion_length": 795.859375, "epoch": 0.4808, "grad_norm": 0.3288602498791611, "kl": 0.162109375, "learning_rate": 1.2392113790208895e-05, "loss": 0.0574, "reward": 2.01953125, "reward_std": 0.4695693925023079, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 1202 }, { "clip_ratio": 0.0, "completion_length": 796.4140625, "epoch": 0.4812, "grad_norm": 0.3165414329337399, "kl": 0.17236328125, "learning_rate": 1.2378554198779632e-05, "loss": 0.0781, "reward": 1.787109375, "reward_std": 0.44149020314216614, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.919921875, "step": 1203 }, { "clip_ratio": 0.0, "completion_length": 772.5234375, "epoch": 0.4816, "grad_norm": 0.6301208436194513, "kl": 0.185302734375, "learning_rate": 1.236498997023725e-05, "loss": 0.0641, "reward": 2.013671875, "reward_std": 0.38587944209575653, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 1204 }, { "clip_ratio": 0.0, "completion_length": 752.78125, "epoch": 0.482, "grad_norm": 0.45768264879432335, "kl": 0.1796875, "learning_rate": 1.23514211310259e-05, "loss": 0.1062, "reward": 2.09765625, "reward_std": 0.4150552526116371, "rewards/accuracy_reward": 0.2421875, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.94921875, "step": 1205 }, { "clip_ratio": 0.0, "completion_length": 706.328125, "epoch": 0.4824, "grad_norm": 0.48189362338265757, "kl": 0.175048828125, "learning_rate": 1.2337847707598738e-05, "loss": 0.0488, "reward": 2.0, "reward_std": 0.4094544053077698, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.9609375, "step": 1206 }, { "clip_ratio": 0.0, "completion_length": 726.921875, "epoch": 0.4828, "grad_norm": 3.957978090823488, "kl": 0.373779296875, "learning_rate": 1.2324269726417841e-05, "loss": 0.0585, "reward": 2.017578125, "reward_std": 0.3748009651899338, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.947265625, "step": 1207 }, { "clip_ratio": 0.0, "completion_length": 740.1875, "epoch": 0.4832, "grad_norm": 0.3302283494306475, "kl": 0.175048828125, "learning_rate": 1.2310687213954182e-05, "loss": 0.0598, "reward": 1.958984375, "reward_std": 0.2971004769206047, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1208 }, { "clip_ratio": 0.0, "completion_length": 645.0703125, "epoch": 0.4836, "grad_norm": 0.3545855978392219, "kl": 0.19287109375, "learning_rate": 1.2297100196687557e-05, "loss": 0.062, "reward": 2.11328125, "reward_std": 0.34118346124887466, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.97265625, "step": 1209 }, { "clip_ratio": 0.0, "completion_length": 631.71875, "epoch": 0.484, "grad_norm": 0.26970434459713477, "kl": 0.180419921875, "learning_rate": 1.2283508701106559e-05, "loss": 0.045, "reward": 2.044921875, "reward_std": 0.2860976979136467, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1210 }, { "clip_ratio": 0.0, "completion_length": 665.1875, "epoch": 0.4844, "grad_norm": 24.87445611690912, "kl": 0.931640625, "learning_rate": 1.2269912753708502e-05, "loss": 0.0837, "reward": 1.962890625, "reward_std": 0.25791895389556885, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 1211 }, { "clip_ratio": 0.0, "completion_length": 619.3046875, "epoch": 0.4848, "grad_norm": 0.3283708178419865, "kl": 0.171142578125, "learning_rate": 1.2256312380999376e-05, "loss": 0.0584, "reward": 1.95703125, "reward_std": 0.30891670286655426, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.97265625, "step": 1212 }, { "clip_ratio": 0.0, "completion_length": 616.5078125, "epoch": 0.4852, "grad_norm": 0.4674455526149219, "kl": 0.177001953125, "learning_rate": 1.2242707609493814e-05, "loss": 0.0934, "reward": 2.068359375, "reward_std": 0.5603287145495415, "rewards/accuracy_reward": 0.3515625, "rewards/format_reward": 0.7890625, "rewards/tag_count_reward": 0.927734375, "step": 1213 }, { "clip_ratio": 0.0, "completion_length": 613.0546875, "epoch": 0.4856, "grad_norm": 0.4523991671715294, "kl": 0.175537109375, "learning_rate": 1.2229098465715005e-05, "loss": 0.0324, "reward": 1.9453125, "reward_std": 0.4869326949119568, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.796875, "rewards/tag_count_reward": 0.9375, "step": 1214 }, { "clip_ratio": 0.0, "completion_length": 592.03125, "epoch": 0.486, "grad_norm": 0.4375754441285744, "kl": 0.16650390625, "learning_rate": 1.2215484976194675e-05, "loss": 0.0753, "reward": 1.689453125, "reward_std": 0.5414498075842857, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.765625, "rewards/tag_count_reward": 0.923828125, "step": 1215 }, { "clip_ratio": 0.0, "completion_length": 566.8203125, "epoch": 0.4864, "grad_norm": 0.5595659542621998, "kl": 0.22021484375, "learning_rate": 1.2201867167473015e-05, "loss": 0.0234, "reward": 1.810546875, "reward_std": 0.45272092521190643, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8515625, "rewards/tag_count_reward": 0.958984375, "step": 1216 }, { "clip_ratio": 0.0, "completion_length": 580.1328125, "epoch": 0.4868, "grad_norm": 0.34765277645646403, "kl": 0.161865234375, "learning_rate": 1.2188245066098647e-05, "loss": 0.0215, "reward": 1.966796875, "reward_std": 0.3652081787586212, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.8515625, "rewards/tag_count_reward": 0.951171875, "step": 1217 }, { "clip_ratio": 0.0, "completion_length": 560.3828125, "epoch": 0.4872, "grad_norm": 0.6270219180733887, "kl": 0.2138671875, "learning_rate": 1.217461869862855e-05, "loss": 0.0647, "reward": 1.921875, "reward_std": 0.47029872238636017, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.8515625, "rewards/tag_count_reward": 0.9453125, "step": 1218 }, { "clip_ratio": 0.0, "completion_length": 658.265625, "epoch": 0.4876, "grad_norm": 0.37473625031615254, "kl": 0.17578125, "learning_rate": 1.2160988091628023e-05, "loss": 0.0699, "reward": 1.8359375, "reward_std": 0.5484568998217583, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.9375, "step": 1219 }, { "clip_ratio": 0.0, "completion_length": 557.4375, "epoch": 0.488, "grad_norm": 0.382482897291309, "kl": 0.198974609375, "learning_rate": 1.2147353271670634e-05, "loss": 0.1068, "reward": 1.876953125, "reward_std": 0.31427812576293945, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.962890625, "step": 1220 }, { "clip_ratio": 0.0, "completion_length": 594.5078125, "epoch": 0.4884, "grad_norm": 0.3620223487822809, "kl": 0.1826171875, "learning_rate": 1.2133714265338162e-05, "loss": 0.0455, "reward": 1.939453125, "reward_std": 0.3460022658109665, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.970703125, "step": 1221 }, { "clip_ratio": 0.0, "completion_length": 510.2265625, "epoch": 0.4888, "grad_norm": 0.5235962025627678, "kl": 0.20068359375, "learning_rate": 1.212007109922055e-05, "loss": 0.0752, "reward": 2.03125, "reward_std": 0.38787609338760376, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.96875, "step": 1222 }, { "clip_ratio": 0.0, "completion_length": 609.28125, "epoch": 0.4892, "grad_norm": 0.23465607525699925, "kl": 0.180908203125, "learning_rate": 1.2106423799915841e-05, "loss": 0.0077, "reward": 1.998046875, "reward_std": 0.1519516110420227, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.990234375, "step": 1223 }, { "clip_ratio": 0.0, "completion_length": 606.7265625, "epoch": 0.4896, "grad_norm": 0.40867451758445605, "kl": 0.216552734375, "learning_rate": 1.2092772394030153e-05, "loss": 0.1355, "reward": 1.90234375, "reward_std": 0.48701460659503937, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.93359375, "step": 1224 }, { "clip_ratio": 0.0, "completion_length": 600.3359375, "epoch": 0.49, "grad_norm": 0.33082733714422774, "kl": 0.18798828125, "learning_rate": 1.2079116908177592e-05, "loss": 0.0613, "reward": 1.921875, "reward_std": 0.33258819580078125, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.9609375, "step": 1225 }, { "clip_ratio": 0.0, "completion_length": 657.71875, "epoch": 0.4904, "grad_norm": 0.3713814231864422, "kl": 0.19580078125, "learning_rate": 1.2065457368980236e-05, "loss": 0.0247, "reward": 1.98046875, "reward_std": 0.39576253294944763, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.97265625, "step": 1226 }, { "clip_ratio": 0.0, "completion_length": 701.21875, "epoch": 0.4908, "grad_norm": 0.29973796184206397, "kl": 0.182861328125, "learning_rate": 1.2051793803068046e-05, "loss": 0.0947, "reward": 1.908203125, "reward_std": 0.3854259252548218, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.955078125, "step": 1227 }, { "clip_ratio": 0.0, "completion_length": 728.359375, "epoch": 0.4912, "grad_norm": 0.33784405210330687, "kl": 0.1767578125, "learning_rate": 1.203812623707885e-05, "loss": 0.1093, "reward": 1.873046875, "reward_std": 0.41645148396492004, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.951171875, "step": 1228 }, { "clip_ratio": 0.0, "completion_length": 755.4453125, "epoch": 0.4916, "grad_norm": 0.31758867766981114, "kl": 0.193359375, "learning_rate": 1.202445469765826e-05, "loss": 0.079, "reward": 1.98828125, "reward_std": 0.3468427509069443, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.95703125, "step": 1229 }, { "clip_ratio": 0.0, "completion_length": 707.4609375, "epoch": 0.492, "grad_norm": 0.37736394177340404, "kl": 0.194091796875, "learning_rate": 1.2010779211459649e-05, "loss": 0.1101, "reward": 1.96484375, "reward_std": 0.5845633521676064, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.93359375, "step": 1230 }, { "clip_ratio": 0.0, "completion_length": 679.1640625, "epoch": 0.4924, "grad_norm": 0.38872042117674505, "kl": 0.19482421875, "learning_rate": 1.1997099805144071e-05, "loss": 0.177, "reward": 1.85546875, "reward_std": 0.417943611741066, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.92578125, "step": 1231 }, { "clip_ratio": 0.0, "completion_length": 704.390625, "epoch": 0.4928, "grad_norm": 0.3558096282609534, "kl": 0.177978515625, "learning_rate": 1.1983416505380234e-05, "loss": 0.0913, "reward": 1.94140625, "reward_std": 0.3148151636123657, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 1232 }, { "clip_ratio": 0.0, "completion_length": 776.2109375, "epoch": 0.4932, "grad_norm": 0.3059084210677874, "kl": 0.1962890625, "learning_rate": 1.1969729338844429e-05, "loss": 0.0914, "reward": 1.935546875, "reward_std": 0.5180581882596016, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.935546875, "step": 1233 }, { "clip_ratio": 0.0, "completion_length": 594.15625, "epoch": 0.4936, "grad_norm": 1.5703400435919281, "kl": 0.228271484375, "learning_rate": 1.1956038332220484e-05, "loss": 0.133, "reward": 1.982421875, "reward_std": 0.2913404181599617, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.943359375, "step": 1234 }, { "clip_ratio": 0.0, "completion_length": 605.328125, "epoch": 0.494, "grad_norm": 0.36783936220058877, "kl": 0.184326171875, "learning_rate": 1.194234351219972e-05, "loss": 0.1103, "reward": 1.908203125, "reward_std": 0.26260800659656525, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 1235 }, { "clip_ratio": 0.0, "completion_length": 716.59375, "epoch": 0.4944, "grad_norm": 0.5585986485708608, "kl": 0.216552734375, "learning_rate": 1.192864490548089e-05, "loss": 0.0683, "reward": 1.849609375, "reward_std": 0.32887736707925797, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.935546875, "step": 1236 }, { "clip_ratio": 0.0, "completion_length": 717.6953125, "epoch": 0.4948, "grad_norm": 0.6066969514664764, "kl": 0.1865234375, "learning_rate": 1.191494253877013e-05, "loss": 0.074, "reward": 1.91015625, "reward_std": 0.2977629229426384, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.96484375, "step": 1237 }, { "clip_ratio": 0.0, "completion_length": 683.921875, "epoch": 0.4952, "grad_norm": 0.3308097792920367, "kl": 0.175537109375, "learning_rate": 1.1901236438780902e-05, "loss": 0.1303, "reward": 2.001953125, "reward_std": 0.3816482871770859, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.947265625, "step": 1238 }, { "clip_ratio": 0.0, "completion_length": 675.40625, "epoch": 0.4956, "grad_norm": 0.9010095938348689, "kl": 0.21240234375, "learning_rate": 1.1887526632233954e-05, "loss": 0.1191, "reward": 2.029296875, "reward_std": 0.34815484285354614, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 1239 }, { "clip_ratio": 0.0, "completion_length": 783.28125, "epoch": 0.496, "grad_norm": 0.35503271978390116, "kl": 0.19775390625, "learning_rate": 1.187381314585725e-05, "loss": 0.1055, "reward": 1.73046875, "reward_std": 0.47014833241701126, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8359375, "rewards/tag_count_reward": 0.89453125, "step": 1240 }, { "clip_ratio": 0.0, "completion_length": 729.078125, "epoch": 0.4964, "grad_norm": 0.5391985849469076, "kl": 0.20068359375, "learning_rate": 1.186009600638593e-05, "loss": 0.1365, "reward": 1.875, "reward_std": 0.48214006423950195, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9375, "step": 1241 }, { "clip_ratio": 0.0, "completion_length": 703.9375, "epoch": 0.4968, "grad_norm": 2.143997014158882, "kl": 0.231689453125, "learning_rate": 1.184637524056227e-05, "loss": 0.1579, "reward": 1.978515625, "reward_std": 0.5746912732720375, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.8515625, "rewards/tag_count_reward": 0.892578125, "step": 1242 }, { "clip_ratio": 0.0, "completion_length": 793.140625, "epoch": 0.4972, "grad_norm": 0.8291768693907137, "kl": 0.31494140625, "learning_rate": 1.1832650875135599e-05, "loss": 0.1868, "reward": 1.439453125, "reward_std": 0.751770555973053, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.671875, "rewards/tag_count_reward": 0.767578125, "step": 1243 }, { "clip_ratio": 0.0, "completion_length": 744.6171875, "epoch": 0.4976, "grad_norm": 0.926819394803899, "kl": 0.28955078125, "learning_rate": 1.181892293686227e-05, "loss": 0.2012, "reward": 1.693359375, "reward_std": 0.7271936386823654, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.7421875, "rewards/tag_count_reward": 0.818359375, "step": 1244 }, { "clip_ratio": 0.0, "completion_length": 767.3125, "epoch": 0.498, "grad_norm": 1.0807539846606962, "kl": 0.33251953125, "learning_rate": 1.1805191452505602e-05, "loss": 0.2117, "reward": 1.375, "reward_std": 0.7430383265018463, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.640625, "rewards/tag_count_reward": 0.734375, "step": 1245 }, { "clip_ratio": 0.0, "completion_length": 831.6875, "epoch": 0.4984, "grad_norm": 1.2623545458798362, "kl": 0.2880859375, "learning_rate": 1.1791456448835825e-05, "loss": 0.1604, "reward": 1.60546875, "reward_std": 0.7711075246334076, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.640625, "rewards/tag_count_reward": 0.73046875, "step": 1246 }, { "clip_ratio": 0.0, "completion_length": 699.3125, "epoch": 0.4988, "grad_norm": 1.0181205629040544, "kl": 0.21826171875, "learning_rate": 1.1777717952630033e-05, "loss": 0.2184, "reward": 1.578125, "reward_std": 0.6330820247530937, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.75, "rewards/tag_count_reward": 0.828125, "step": 1247 }, { "clip_ratio": 0.0, "completion_length": 623.5234375, "epoch": 0.4992, "grad_norm": 2.901128958443841, "kl": 0.208984375, "learning_rate": 1.1763975990672125e-05, "loss": 0.2322, "reward": 1.927734375, "reward_std": 0.5155502706766129, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.912109375, "step": 1248 }, { "clip_ratio": 0.0, "completion_length": 675.421875, "epoch": 0.4996, "grad_norm": 1.421173092169443, "kl": 0.3134765625, "learning_rate": 1.1750230589752763e-05, "loss": 0.2068, "reward": 1.8203125, "reward_std": 0.6331717222929001, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.90625, "step": 1249 }, { "clip_ratio": 0.0, "completion_length": 701.5, "epoch": 0.5, "grad_norm": 0.8935445607902112, "kl": 0.220458984375, "learning_rate": 1.1736481776669307e-05, "loss": 0.2058, "reward": 1.763671875, "reward_std": 0.7105149924755096, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.8125, "rewards/tag_count_reward": 0.857421875, "step": 1250 }, { "clip_ratio": 0.0, "completion_length": 757.7265625, "epoch": 0.5004, "grad_norm": 1.1078608197557502, "kl": 0.238525390625, "learning_rate": 1.1722729578225769e-05, "loss": 0.1885, "reward": 1.625, "reward_std": 0.6709536537528038, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.7734375, "rewards/tag_count_reward": 0.84375, "step": 1251 }, { "clip_ratio": 0.0, "completion_length": 766.4453125, "epoch": 0.5008, "grad_norm": 1.8381903198320233, "kl": 0.28955078125, "learning_rate": 1.1708974021232768e-05, "loss": 0.2631, "reward": 1.427734375, "reward_std": 0.8358160927891731, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.5703125, "rewards/tag_count_reward": 0.677734375, "step": 1252 }, { "clip_ratio": 0.0, "completion_length": 738.546875, "epoch": 0.5012, "grad_norm": 2.3691204337568235, "kl": 0.24755859375, "learning_rate": 1.1695215132507465e-05, "loss": 0.2475, "reward": 1.49609375, "reward_std": 0.7725977003574371, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.703125, "rewards/tag_count_reward": 0.78515625, "step": 1253 }, { "clip_ratio": 0.0, "completion_length": 678.2265625, "epoch": 0.5016, "grad_norm": 1.1020339205305678, "kl": 0.205810546875, "learning_rate": 1.1681452938873516e-05, "loss": 0.1678, "reward": 1.78515625, "reward_std": 0.492422953248024, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.91015625, "step": 1254 }, { "clip_ratio": 0.0, "completion_length": 634.640625, "epoch": 0.502, "grad_norm": 0.5993603355030284, "kl": 0.183837890625, "learning_rate": 1.1667687467161025e-05, "loss": 0.1923, "reward": 1.72265625, "reward_std": 0.48793354630470276, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.828125, "rewards/tag_count_reward": 0.87890625, "step": 1255 }, { "clip_ratio": 0.0, "completion_length": 641.765625, "epoch": 0.5024, "grad_norm": 0.7173207290537785, "kl": 0.177978515625, "learning_rate": 1.1653918744206478e-05, "loss": 0.1702, "reward": 1.830078125, "reward_std": 0.4413367807865143, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.923828125, "step": 1256 }, { "clip_ratio": 0.0, "completion_length": 693.53125, "epoch": 0.5028, "grad_norm": 0.515529811184722, "kl": 0.20263671875, "learning_rate": 1.1640146796852711e-05, "loss": 0.17, "reward": 1.732421875, "reward_std": 0.5879864320158958, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.8359375, "rewards/tag_count_reward": 0.880859375, "step": 1257 }, { "clip_ratio": 0.0, "completion_length": 596.8671875, "epoch": 0.5032, "grad_norm": 0.9712832612175974, "kl": 0.193115234375, "learning_rate": 1.1626371651948839e-05, "loss": 0.1135, "reward": 1.86328125, "reward_std": 0.3617328256368637, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.93359375, "step": 1258 }, { "clip_ratio": 0.0, "completion_length": 634.21875, "epoch": 0.5036, "grad_norm": 4.055746220808953, "kl": 0.204833984375, "learning_rate": 1.1612593336350209e-05, "loss": 0.1655, "reward": 1.85546875, "reward_std": 0.43460480868816376, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.93359375, "step": 1259 }, { "clip_ratio": 0.0, "completion_length": 651.6640625, "epoch": 0.504, "grad_norm": 0.5058477221762562, "kl": 0.200439453125, "learning_rate": 1.159881187691835e-05, "loss": 0.2376, "reward": 1.689453125, "reward_std": 0.5999239087104797, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8203125, "rewards/tag_count_reward": 0.869140625, "step": 1260 }, { "clip_ratio": 0.0, "completion_length": 728.6640625, "epoch": 0.5044, "grad_norm": 0.418073470905776, "kl": 0.181884765625, "learning_rate": 1.158502730052093e-05, "loss": 0.165, "reward": 1.68359375, "reward_std": 0.6391264796257019, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8203125, "rewards/tag_count_reward": 0.86328125, "step": 1261 }, { "clip_ratio": 0.0, "completion_length": 642.3359375, "epoch": 0.5048, "grad_norm": 0.52652657289956, "kl": 0.21484375, "learning_rate": 1.157123963403168e-05, "loss": 0.2, "reward": 1.953125, "reward_std": 0.6154344156384468, "rewards/accuracy_reward": 0.2421875, "rewards/format_reward": 0.8359375, "rewards/tag_count_reward": 0.875, "step": 1262 }, { "clip_ratio": 0.0, "completion_length": 718.265625, "epoch": 0.5052, "grad_norm": 0.55635816483139, "kl": 0.212158203125, "learning_rate": 1.1557448904330362e-05, "loss": 0.1552, "reward": 1.67578125, "reward_std": 0.5950021520256996, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8125, "rewards/tag_count_reward": 0.86328125, "step": 1263 }, { "clip_ratio": 0.0, "completion_length": 776.078125, "epoch": 0.5056, "grad_norm": 0.6753627805163983, "kl": 0.191162109375, "learning_rate": 1.1543655138302714e-05, "loss": 0.2079, "reward": 1.515625, "reward_std": 0.7539601176977158, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.6875, "rewards/tag_count_reward": 0.765625, "step": 1264 }, { "clip_ratio": 0.0, "completion_length": 790.28125, "epoch": 0.506, "grad_norm": 0.5960606089331012, "kl": 0.1751708984375, "learning_rate": 1.1529858362840383e-05, "loss": 0.1156, "reward": 1.7578125, "reward_std": 0.5439182966947556, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8515625, "rewards/tag_count_reward": 0.90625, "step": 1265 }, { "clip_ratio": 0.0, "completion_length": 728.1484375, "epoch": 0.5064, "grad_norm": 0.9546646845913983, "kl": 0.1982421875, "learning_rate": 1.1516058604840891e-05, "loss": 0.0876, "reward": 1.73046875, "reward_std": 0.4048089310526848, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.828125, "rewards/tag_count_reward": 0.87109375, "step": 1266 }, { "clip_ratio": 0.0, "completion_length": 716.3515625, "epoch": 0.5068, "grad_norm": 1.3264053060645031, "kl": 0.18603515625, "learning_rate": 1.1502255891207572e-05, "loss": 0.1527, "reward": 1.80859375, "reward_std": 0.612016499042511, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.89453125, "step": 1267 }, { "clip_ratio": 0.0, "completion_length": 810.984375, "epoch": 0.5072, "grad_norm": 0.36066696302117485, "kl": 0.168212890625, "learning_rate": 1.1488450248849523e-05, "loss": 0.112, "reward": 1.630859375, "reward_std": 0.6139557883143425, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.78125, "rewards/tag_count_reward": 0.849609375, "step": 1268 }, { "clip_ratio": 0.0, "completion_length": 781.8125, "epoch": 0.5076, "grad_norm": 1.5226360645077626, "kl": 0.261962890625, "learning_rate": 1.1474641704681551e-05, "loss": 0.2106, "reward": 1.4765625, "reward_std": 0.8296866118907928, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.6875, "rewards/tag_count_reward": 0.7734375, "step": 1269 }, { "clip_ratio": 0.0, "completion_length": 705.3515625, "epoch": 0.508, "grad_norm": 0.7632124657922839, "kl": 0.20068359375, "learning_rate": 1.1460830285624119e-05, "loss": 0.1743, "reward": 1.599609375, "reward_std": 0.60723827034235, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.7578125, "rewards/tag_count_reward": 0.826171875, "step": 1270 }, { "clip_ratio": 0.0, "completion_length": 788.296875, "epoch": 0.5084, "grad_norm": 0.37065356734676397, "kl": 0.181884765625, "learning_rate": 1.1447016018603293e-05, "loss": 0.1228, "reward": 1.7109375, "reward_std": 0.5739647224545479, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8359375, "rewards/tag_count_reward": 0.875, "step": 1271 }, { "clip_ratio": 0.0, "completion_length": 767.7421875, "epoch": 0.5088, "grad_norm": 1.0007932956504564, "kl": 0.2138671875, "learning_rate": 1.1433198930550694e-05, "loss": 0.0976, "reward": 1.8828125, "reward_std": 0.3884742856025696, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9375, "step": 1272 }, { "clip_ratio": 0.0, "completion_length": 755.140625, "epoch": 0.5092, "grad_norm": 1.9254017585807106, "kl": 0.249267578125, "learning_rate": 1.1419379048403446e-05, "loss": 0.1051, "reward": 1.935546875, "reward_std": 0.42955319583415985, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.8203125, "rewards/tag_count_reward": 0.904296875, "step": 1273 }, { "clip_ratio": 0.0, "completion_length": 745.5078125, "epoch": 0.5096, "grad_norm": 1.1909077887991042, "kl": 0.18115234375, "learning_rate": 1.140555639910411e-05, "loss": 0.047, "reward": 2.03125, "reward_std": 0.33251146972179413, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.953125, "step": 1274 }, { "clip_ratio": 0.0, "completion_length": 752.9140625, "epoch": 0.51, "grad_norm": 0.2894843492683282, "kl": 0.18994140625, "learning_rate": 1.1391731009600655e-05, "loss": 0.0273, "reward": 1.921875, "reward_std": 0.15447160601615906, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1275 }, { "clip_ratio": 0.0, "completion_length": 793.3046875, "epoch": 0.5104, "grad_norm": 0.4523433510665805, "kl": 0.1865234375, "learning_rate": 1.137790290684638e-05, "loss": 0.0426, "reward": 1.947265625, "reward_std": 0.3610261231660843, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.962890625, "step": 1276 }, { "clip_ratio": 0.0, "completion_length": 740.84375, "epoch": 0.5108, "grad_norm": 0.19773866979571303, "kl": 0.212158203125, "learning_rate": 1.1364072117799884e-05, "loss": 0.0181, "reward": 1.986328125, "reward_std": 0.0546875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1277 }, { "clip_ratio": 0.0, "completion_length": 700.7734375, "epoch": 0.5112, "grad_norm": 0.26101112668216814, "kl": 0.1953125, "learning_rate": 1.1350238669424993e-05, "loss": 0.0508, "reward": 2.166015625, "reward_std": 0.20259655267000198, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.978515625, "step": 1278 }, { "clip_ratio": 0.0, "completion_length": 710.390625, "epoch": 0.5116, "grad_norm": 0.23079648520082027, "kl": 0.19873046875, "learning_rate": 1.1336402588690727e-05, "loss": 0.0431, "reward": 1.95703125, "reward_std": 0.215688094496727, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1279 }, { "clip_ratio": 0.0, "completion_length": 613.9609375, "epoch": 0.512, "grad_norm": 0.23894433232757445, "kl": 0.214599609375, "learning_rate": 1.1322563902571227e-05, "loss": 0.0639, "reward": 2.01171875, "reward_std": 0.21076399087905884, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 1280 }, { "clip_ratio": 0.0, "completion_length": 644.6875, "epoch": 0.5124, "grad_norm": 1.2435825822343174, "kl": 0.21728515625, "learning_rate": 1.1308722638045724e-05, "loss": 0.0376, "reward": 2.04296875, "reward_std": 0.15030688047409058, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1281 }, { "clip_ratio": 0.0, "completion_length": 611.1484375, "epoch": 0.5128, "grad_norm": 0.1482904764023001, "kl": 0.175048828125, "learning_rate": 1.129487882209847e-05, "loss": 0.0287, "reward": 2.111328125, "reward_std": 0.0546875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1282 }, { "clip_ratio": 0.0, "completion_length": 573.359375, "epoch": 0.5132, "grad_norm": 0.29167826721835366, "kl": 0.22314453125, "learning_rate": 1.1281032481718696e-05, "loss": 0.1321, "reward": 1.97265625, "reward_std": 0.1923333778977394, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.93359375, "step": 1283 }, { "clip_ratio": 0.0, "completion_length": 636.6015625, "epoch": 0.5136, "grad_norm": 0.2937708378812805, "kl": 0.24072265625, "learning_rate": 1.1267183643900548e-05, "loss": 0.0971, "reward": 2.10546875, "reward_std": 0.2222641110420227, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 1284 }, { "clip_ratio": 0.0, "completion_length": 681.8671875, "epoch": 0.514, "grad_norm": 0.25674487375740723, "kl": 0.194091796875, "learning_rate": 1.1253332335643043e-05, "loss": -0.0005, "reward": 2.1171875, "reward_std": 0.12984732538461685, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1285 }, { "clip_ratio": 0.0, "completion_length": 659.7109375, "epoch": 0.5144, "grad_norm": 0.11139540037122582, "kl": 0.189453125, "learning_rate": 1.1239478583950019e-05, "loss": 0.0056, "reward": 2.203125, "reward_std": 0.0625, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1286 }, { "clip_ratio": 0.0, "completion_length": 702.3125, "epoch": 0.5148, "grad_norm": 0.28469516002876244, "kl": 0.206787109375, "learning_rate": 1.1225622415830068e-05, "loss": 0.0508, "reward": 2.05859375, "reward_std": 0.16921419650316238, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1287 }, { "clip_ratio": 0.0, "completion_length": 578.890625, "epoch": 0.5152, "grad_norm": 0.07080285048389838, "kl": 0.19580078125, "learning_rate": 1.1211763858296507e-05, "loss": 0.0079, "reward": 2.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1288 }, { "clip_ratio": 0.0, "completion_length": 739.4921875, "epoch": 0.5156, "grad_norm": 0.2646045004438993, "kl": 0.19921875, "learning_rate": 1.1197902938367297e-05, "loss": 0.0238, "reward": 1.9453125, "reward_std": 0.24659235030412674, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.9765625, "step": 1289 }, { "clip_ratio": 0.0, "completion_length": 740.0078125, "epoch": 0.516, "grad_norm": 0.19764995761250673, "kl": 0.195068359375, "learning_rate": 1.1184039683065014e-05, "loss": 0.0383, "reward": 1.97265625, "reward_std": 0.153188094496727, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98828125, "step": 1290 }, { "clip_ratio": 0.0, "completion_length": 731.015625, "epoch": 0.5164, "grad_norm": 0.07270216424769538, "kl": 0.207275390625, "learning_rate": 1.1170174119416778e-05, "loss": 0.0083, "reward": 2.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1291 }, { "clip_ratio": 0.0, "completion_length": 815.6953125, "epoch": 0.5168, "grad_norm": 0.25975898995382946, "kl": 0.203857421875, "learning_rate": 1.1156306274454218e-05, "loss": 0.0677, "reward": 1.853515625, "reward_std": 0.39350471645593643, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.947265625, "step": 1292 }, { "clip_ratio": 0.0, "completion_length": 726.46875, "epoch": 0.5172, "grad_norm": 0.7039995255689749, "kl": 0.2236328125, "learning_rate": 1.1142436175213409e-05, "loss": 0.0567, "reward": 2.0078125, "reward_std": 0.2868804335594177, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1293 }, { "clip_ratio": 0.0, "completion_length": 782.4140625, "epoch": 0.5176, "grad_norm": 0.309326840226664, "kl": 0.19091796875, "learning_rate": 1.1128563848734817e-05, "loss": 0.0381, "reward": 1.904296875, "reward_std": 0.373760923743248, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.951171875, "step": 1294 }, { "clip_ratio": 0.0, "completion_length": 728.875, "epoch": 0.518, "grad_norm": 0.32281016055191375, "kl": 0.177490234375, "learning_rate": 1.1114689322063255e-05, "loss": 0.0434, "reward": 2.033203125, "reward_std": 0.2649558112025261, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 1295 }, { "clip_ratio": 0.0, "completion_length": 762.40625, "epoch": 0.5184, "grad_norm": 0.1716152258698796, "kl": 0.18212890625, "learning_rate": 1.1100812622247823e-05, "loss": 0.0255, "reward": 2.0703125, "reward_std": 0.12980970740318298, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.984375, "step": 1296 }, { "clip_ratio": 0.0, "completion_length": 664.8125, "epoch": 0.5188, "grad_norm": 0.4015057152410443, "kl": 0.232666015625, "learning_rate": 1.1086933776341853e-05, "loss": 0.144, "reward": 2.009765625, "reward_std": 0.3314001113176346, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.955078125, "step": 1297 }, { "clip_ratio": 0.0, "completion_length": 779.078125, "epoch": 0.5192, "grad_norm": 0.2406322139834257, "kl": 0.201171875, "learning_rate": 1.1073052811402867e-05, "loss": 0.048, "reward": 1.857421875, "reward_std": 0.35726869851350784, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.943359375, "step": 1298 }, { "clip_ratio": 0.0, "completion_length": 729.859375, "epoch": 0.5196, "grad_norm": 0.22875486072285325, "kl": 0.194580078125, "learning_rate": 1.105916975449252e-05, "loss": 0.047, "reward": 2.099609375, "reward_std": 0.2265625, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1299 }, { "clip_ratio": 0.0, "completion_length": 684.5, "epoch": 0.52, "grad_norm": 0.26184748934564395, "kl": 0.20361328125, "learning_rate": 1.1045284632676535e-05, "loss": 0.0478, "reward": 1.974609375, "reward_std": 0.20539497584104538, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1300 }, { "clip_ratio": 0.0, "completion_length": 700.0, "epoch": 0.5204, "grad_norm": 0.3280322892714436, "kl": 0.190673828125, "learning_rate": 1.1031397473024674e-05, "loss": 0.0389, "reward": 1.99609375, "reward_std": 0.24411680549383163, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1301 }, { "clip_ratio": 0.0, "completion_length": 612.1015625, "epoch": 0.5208, "grad_norm": 0.25383085283260076, "kl": 0.199951171875, "learning_rate": 1.1017508302610665e-05, "loss": 0.0174, "reward": 2.017578125, "reward_std": 0.1363266110420227, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1302 }, { "clip_ratio": 0.0, "completion_length": 650.6484375, "epoch": 0.5212, "grad_norm": 0.6134894984610627, "kl": 0.19775390625, "learning_rate": 1.1003617148512149e-05, "loss": 0.0743, "reward": 1.994140625, "reward_std": 0.3969717398285866, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 1303 }, { "clip_ratio": 0.0, "completion_length": 626.7578125, "epoch": 0.5216, "grad_norm": 0.3062299989803775, "kl": 0.20361328125, "learning_rate": 1.0989724037810651e-05, "loss": 0.0293, "reward": 1.98828125, "reward_std": 0.1174129769206047, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1304 }, { "clip_ratio": 0.0, "completion_length": 671.6015625, "epoch": 0.522, "grad_norm": 0.18360624279572418, "kl": 0.189697265625, "learning_rate": 1.0975828997591496e-05, "loss": 0.0742, "reward": 1.939453125, "reward_std": 0.2076384201645851, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1305 }, { "clip_ratio": 0.0, "completion_length": 693.3984375, "epoch": 0.5224, "grad_norm": 0.20981959875310613, "kl": 0.205322265625, "learning_rate": 1.0961932054943778e-05, "loss": 0.096, "reward": 1.9296875, "reward_std": 0.24659234285354614, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.96875, "step": 1306 }, { "clip_ratio": 0.0, "completion_length": 621.265625, "epoch": 0.5228, "grad_norm": 0.32601736577637186, "kl": 0.17431640625, "learning_rate": 1.0948033236960294e-05, "loss": 0.057, "reward": 2.212890625, "reward_std": 0.2734375, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 1307 }, { "clip_ratio": 0.0, "completion_length": 741.984375, "epoch": 0.5232, "grad_norm": 0.33335057846705923, "kl": 0.1845703125, "learning_rate": 1.0934132570737508e-05, "loss": 0.0712, "reward": 1.96484375, "reward_std": 0.3578447327017784, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 1308 }, { "clip_ratio": 0.0, "completion_length": 668.5625, "epoch": 0.5236, "grad_norm": 0.22048742674081748, "kl": 0.156494140625, "learning_rate": 1.0920230083375474e-05, "loss": 0.0273, "reward": 1.98046875, "reward_std": 0.11945747584104538, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1309 }, { "clip_ratio": 0.0, "completion_length": 758.046875, "epoch": 0.524, "grad_norm": 0.3617680586610384, "kl": 0.21240234375, "learning_rate": 1.0906325801977804e-05, "loss": 0.0962, "reward": 1.91796875, "reward_std": 0.44962530583143234, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.91796875, "step": 1310 }, { "clip_ratio": 0.0, "completion_length": 733.7734375, "epoch": 0.5244, "grad_norm": 0.8203995572690925, "kl": 0.237548828125, "learning_rate": 1.0892419753651606e-05, "loss": 0.0833, "reward": 2.00390625, "reward_std": 0.477314792573452, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94921875, "step": 1311 }, { "clip_ratio": 0.0, "completion_length": 681.6796875, "epoch": 0.5248, "grad_norm": 0.3302833441887552, "kl": 0.2265625, "learning_rate": 1.0878511965507435e-05, "loss": 0.0803, "reward": 2.083984375, "reward_std": 0.42606962472200394, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.951171875, "step": 1312 }, { "clip_ratio": 0.0, "completion_length": 724.421875, "epoch": 0.5252, "grad_norm": 0.22556264546856822, "kl": 0.19482421875, "learning_rate": 1.086460246465923e-05, "loss": 0.0209, "reward": 1.931640625, "reward_std": 0.2021680325269699, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.970703125, "step": 1313 }, { "clip_ratio": 0.0, "completion_length": 771.5859375, "epoch": 0.5256, "grad_norm": 0.24986470070022346, "kl": 0.18994140625, "learning_rate": 1.0850691278224282e-05, "loss": 0.0304, "reward": 2.12109375, "reward_std": 0.2663499787449837, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.97265625, "step": 1314 }, { "clip_ratio": 0.0, "completion_length": 770.03125, "epoch": 0.526, "grad_norm": 0.40560992288496495, "kl": 0.1708984375, "learning_rate": 1.083677843332316e-05, "loss": 0.025, "reward": 2.125, "reward_std": 0.1441391110420227, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1315 }, { "clip_ratio": 0.0, "completion_length": 734.2421875, "epoch": 0.5264, "grad_norm": 0.39643651770621496, "kl": 0.20166015625, "learning_rate": 1.0822863957079657e-05, "loss": 0.0404, "reward": 2.013671875, "reward_std": 0.32607005536556244, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1316 }, { "clip_ratio": 0.0, "completion_length": 885.25, "epoch": 0.5268, "grad_norm": 0.2876149393925782, "kl": 0.171142578125, "learning_rate": 1.0808947876620768e-05, "loss": 0.0417, "reward": 1.92578125, "reward_std": 0.5040371343493462, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.92578125, "step": 1317 }, { "clip_ratio": 0.0, "completion_length": 821.0390625, "epoch": 0.5272, "grad_norm": 0.35888461614927936, "kl": 0.18017578125, "learning_rate": 1.07950302190766e-05, "loss": 0.0628, "reward": 1.890625, "reward_std": 0.3751693144440651, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.9609375, "step": 1318 }, { "clip_ratio": 0.0, "completion_length": 756.890625, "epoch": 0.5276, "grad_norm": 0.2832631740563035, "kl": 0.181396484375, "learning_rate": 1.0781111011580336e-05, "loss": 0.029, "reward": 1.97265625, "reward_std": 0.109375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1319 }, { "clip_ratio": 0.0, "completion_length": 752.2421875, "epoch": 0.528, "grad_norm": 0.1776159305006657, "kl": 0.1767578125, "learning_rate": 1.0767190281268187e-05, "loss": 0.0105, "reward": 1.994140625, "reward_std": 0.0859375, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1320 }, { "clip_ratio": 0.0, "completion_length": 773.0859375, "epoch": 0.5284, "grad_norm": 0.20196597674822953, "kl": 0.179931640625, "learning_rate": 1.0753268055279328e-05, "loss": 0.0382, "reward": 1.984375, "reward_std": 0.25540684908628464, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1321 }, { "clip_ratio": 0.0, "completion_length": 751.8203125, "epoch": 0.5288, "grad_norm": 0.3027095835467332, "kl": 0.194580078125, "learning_rate": 1.0739344360755853e-05, "loss": 0.0535, "reward": 1.857421875, "reward_std": 0.36933494359254837, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.943359375, "step": 1322 }, { "clip_ratio": 0.0, "completion_length": 753.796875, "epoch": 0.5292, "grad_norm": 0.21859761557248514, "kl": 0.175048828125, "learning_rate": 1.072541922484271e-05, "loss": 0.0226, "reward": 1.966796875, "reward_std": 0.1953125, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1323 }, { "clip_ratio": 0.0, "completion_length": 685.1875, "epoch": 0.5296, "grad_norm": 0.25026270280441665, "kl": 0.1607666015625, "learning_rate": 1.071149267468767e-05, "loss": 0.0502, "reward": 2.0703125, "reward_std": 0.18409235030412674, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1324 }, { "clip_ratio": 0.0, "completion_length": 763.7890625, "epoch": 0.53, "grad_norm": 0.2638021380738691, "kl": 0.1669921875, "learning_rate": 1.0697564737441254e-05, "loss": 0.085, "reward": 1.9609375, "reward_std": 0.39252420514822006, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.9453125, "step": 1325 }, { "clip_ratio": 0.0, "completion_length": 733.7421875, "epoch": 0.5304, "grad_norm": 0.4070841836124909, "kl": 0.17041015625, "learning_rate": 1.0683635440256689e-05, "loss": 0.0427, "reward": 2.068359375, "reward_std": 0.29339665174484253, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1326 }, { "clip_ratio": 0.0, "completion_length": 803.375, "epoch": 0.5308, "grad_norm": 0.1624824196668993, "kl": 0.186767578125, "learning_rate": 1.0669704810289852e-05, "loss": 0.035, "reward": 1.93359375, "reward_std": 0.24063260108232498, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1327 }, { "clip_ratio": 0.0, "completion_length": 753.65625, "epoch": 0.5312, "grad_norm": 0.201408405577865, "kl": 0.17724609375, "learning_rate": 1.0655772874699217e-05, "loss": 0.0426, "reward": 1.94921875, "reward_std": 0.1744270622730255, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1328 }, { "clip_ratio": 0.0, "completion_length": 684.359375, "epoch": 0.5316, "grad_norm": 0.22579737655522295, "kl": 0.167724609375, "learning_rate": 1.0641839660645806e-05, "loss": 0.018, "reward": 2.041015625, "reward_std": 0.18453482538461685, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1329 }, { "clip_ratio": 0.0, "completion_length": 789.875, "epoch": 0.532, "grad_norm": 0.14498857575507726, "kl": 0.16943359375, "learning_rate": 1.0627905195293135e-05, "loss": 0.0318, "reward": 1.982421875, "reward_std": 0.1881999894976616, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1330 }, { "clip_ratio": 0.0, "completion_length": 714.7890625, "epoch": 0.5324, "grad_norm": 0.25586122074325857, "kl": 0.18603515625, "learning_rate": 1.0613969505807157e-05, "loss": 0.0912, "reward": 1.87890625, "reward_std": 0.31841064989566803, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94921875, "step": 1331 }, { "clip_ratio": 0.0, "completion_length": 726.5390625, "epoch": 0.5328, "grad_norm": 0.23486502593061367, "kl": 0.177978515625, "learning_rate": 1.0600032619356208e-05, "loss": 0.0467, "reward": 1.921875, "reward_std": 0.2528105527162552, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.96875, "step": 1332 }, { "clip_ratio": 0.0, "completion_length": 771.015625, "epoch": 0.5332, "grad_norm": 0.28329654328406606, "kl": 0.181884765625, "learning_rate": 1.0586094563110965e-05, "loss": 0.103, "reward": 1.9921875, "reward_std": 0.47097743302583694, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9296875, "step": 1333 }, { "clip_ratio": 0.0, "completion_length": 804.71875, "epoch": 0.5336, "grad_norm": 1.0017987309173957, "kl": 0.176513671875, "learning_rate": 1.0572155364244383e-05, "loss": 0.0476, "reward": 2.083984375, "reward_std": 0.31779052317142487, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 1334 }, { "clip_ratio": 0.0, "completion_length": 778.5078125, "epoch": 0.534, "grad_norm": 0.5579894649531207, "kl": 0.1865234375, "learning_rate": 1.055821504993164e-05, "loss": 0.0738, "reward": 1.802734375, "reward_std": 0.417687363922596, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.912109375, "step": 1335 }, { "clip_ratio": 0.0, "completion_length": 776.0859375, "epoch": 0.5344, "grad_norm": 0.2821821018215359, "kl": 0.166259765625, "learning_rate": 1.0544273647350091e-05, "loss": 0.0327, "reward": 2.17578125, "reward_std": 0.3370901122689247, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1336 }, { "clip_ratio": 0.0, "completion_length": 718.484375, "epoch": 0.5348, "grad_norm": 0.31961039188433477, "kl": 0.170654296875, "learning_rate": 1.053033118367922e-05, "loss": 0.0412, "reward": 1.99609375, "reward_std": 0.23880057781934738, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1337 }, { "clip_ratio": 0.0, "completion_length": 768.46875, "epoch": 0.5352, "grad_norm": 0.34517381494566324, "kl": 0.180908203125, "learning_rate": 1.0516387686100566e-05, "loss": 0.0468, "reward": 1.865234375, "reward_std": 0.3965076506137848, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.927734375, "step": 1338 }, { "clip_ratio": 0.0, "completion_length": 714.1328125, "epoch": 0.5356, "grad_norm": 0.2457303848623411, "kl": 0.171142578125, "learning_rate": 1.0502443181797696e-05, "loss": 0.0894, "reward": 2.10546875, "reward_std": 0.43033287674188614, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.94140625, "step": 1339 }, { "clip_ratio": 0.0, "completion_length": 721.578125, "epoch": 0.536, "grad_norm": 0.3224701818050502, "kl": 0.17138671875, "learning_rate": 1.0488497697956134e-05, "loss": 0.1151, "reward": 2.009765625, "reward_std": 0.5361466407775879, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.923828125, "step": 1340 }, { "clip_ratio": 0.0, "completion_length": 721.4609375, "epoch": 0.5364, "grad_norm": 0.9766067793554514, "kl": 0.204833984375, "learning_rate": 1.0474551261763315e-05, "loss": 0.0387, "reward": 1.91796875, "reward_std": 0.17636188864707947, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1341 }, { "clip_ratio": 0.0, "completion_length": 761.9453125, "epoch": 0.5368, "grad_norm": 0.3168295962904492, "kl": 0.16259765625, "learning_rate": 1.0460603900408523e-05, "loss": 0.1035, "reward": 1.857421875, "reward_std": 0.3995512127876282, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.943359375, "step": 1342 }, { "clip_ratio": 0.0, "completion_length": 743.1640625, "epoch": 0.5372, "grad_norm": 0.34791379322146454, "kl": 0.161865234375, "learning_rate": 1.0446655641082864e-05, "loss": 0.0981, "reward": 2.013671875, "reward_std": 0.4286602810025215, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.943359375, "step": 1343 }, { "clip_ratio": 0.0, "completion_length": 758.9453125, "epoch": 0.5376, "grad_norm": 0.49641638979191327, "kl": 0.1669921875, "learning_rate": 1.0432706510979172e-05, "loss": 0.0837, "reward": 2.0078125, "reward_std": 0.42388835549354553, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.953125, "step": 1344 }, { "clip_ratio": 0.0, "completion_length": 887.859375, "epoch": 0.538, "grad_norm": 1.0191580162479779, "kl": 0.16650390625, "learning_rate": 1.0418756537291996e-05, "loss": 0.031, "reward": 1.99609375, "reward_std": 0.43940193206071854, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.95703125, "step": 1345 }, { "clip_ratio": 0.0, "completion_length": 765.8984375, "epoch": 0.5384, "grad_norm": 1.336096956638865, "kl": 0.2021484375, "learning_rate": 1.0404805747217525e-05, "loss": 0.0992, "reward": 1.8515625, "reward_std": 0.44045451283454895, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9375, "step": 1346 }, { "clip_ratio": 0.0, "completion_length": 760.6484375, "epoch": 0.5388, "grad_norm": 0.22552052803924044, "kl": 0.146484375, "learning_rate": 1.0390854167953537e-05, "loss": 0.0474, "reward": 2.07421875, "reward_std": 0.3540658876299858, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1347 }, { "clip_ratio": 0.0, "completion_length": 702.7890625, "epoch": 0.5392, "grad_norm": 0.5386706901743202, "kl": 0.156494140625, "learning_rate": 1.0376901826699349e-05, "loss": 0.0815, "reward": 1.96875, "reward_std": 0.27902641892433167, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1348 }, { "clip_ratio": 0.0, "completion_length": 803.2421875, "epoch": 0.5396, "grad_norm": 1.319034199979315, "kl": 0.20166015625, "learning_rate": 1.036294875065576e-05, "loss": 0.0667, "reward": 1.99609375, "reward_std": 0.3111616298556328, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94921875, "step": 1349 }, { "clip_ratio": 0.0, "completion_length": 751.375, "epoch": 0.54, "grad_norm": 0.24108003022614738, "kl": 0.179443359375, "learning_rate": 1.0348994967025012e-05, "loss": 0.064, "reward": 2.181640625, "reward_std": 0.37703944742679596, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1350 }, { "clip_ratio": 0.0, "completion_length": 767.4375, "epoch": 0.5404, "grad_norm": 0.4270063622444053, "kl": 0.175048828125, "learning_rate": 1.0335040503010715e-05, "loss": 0.0545, "reward": 1.953125, "reward_std": 0.3772137686610222, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.953125, "step": 1351 }, { "clip_ratio": 0.0, "completion_length": 747.515625, "epoch": 0.5408, "grad_norm": 0.45778022566547527, "kl": 0.177978515625, "learning_rate": 1.0321085385817818e-05, "loss": 0.0575, "reward": 2.013671875, "reward_std": 0.3438990116119385, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 1352 }, { "clip_ratio": 0.0, "completion_length": 759.0703125, "epoch": 0.5412, "grad_norm": 0.2624436060594569, "kl": 0.1640625, "learning_rate": 1.030712964265253e-05, "loss": 0.0362, "reward": 2.037109375, "reward_std": 0.30588530004024506, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1353 }, { "clip_ratio": 0.0, "completion_length": 794.09375, "epoch": 0.5416, "grad_norm": 0.1567460238825755, "kl": 0.16845703125, "learning_rate": 1.0293173300722286e-05, "loss": 0.0323, "reward": 1.99609375, "reward_std": 0.30219484120607376, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1354 }, { "clip_ratio": 0.0, "completion_length": 771.0078125, "epoch": 0.542, "grad_norm": 0.12343077362612534, "kl": 0.15869140625, "learning_rate": 1.0279216387235691e-05, "loss": 0.0047, "reward": 2.193359375, "reward_std": 0.15068094432353973, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.974609375, "step": 1355 }, { "clip_ratio": 0.0, "completion_length": 762.5546875, "epoch": 0.5424, "grad_norm": 0.23618089136801793, "kl": 0.18212890625, "learning_rate": 1.026525892940246e-05, "loss": 0.0344, "reward": 2.17578125, "reward_std": 0.28641147911548615, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1356 }, { "clip_ratio": 0.0, "completion_length": 801.1953125, "epoch": 0.5428, "grad_norm": 0.14954854652059452, "kl": 0.1669921875, "learning_rate": 1.0251300954433377e-05, "loss": 0.0585, "reward": 2.205078125, "reward_std": 0.33213087916374207, "rewards/accuracy_reward": 0.2890625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.962890625, "step": 1357 }, { "clip_ratio": 0.0, "completion_length": 791.875, "epoch": 0.5432, "grad_norm": 0.22344315125836461, "kl": 0.160888671875, "learning_rate": 1.0237342489540221e-05, "loss": 0.043, "reward": 2.07421875, "reward_std": 0.203125, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1358 }, { "clip_ratio": 0.0, "completion_length": 779.0078125, "epoch": 0.5436, "grad_norm": 0.24496739774694104, "kl": 0.171875, "learning_rate": 1.0223383561935738e-05, "loss": 0.0286, "reward": 2.064453125, "reward_std": 0.20752985030412674, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 1359 }, { "clip_ratio": 0.0, "completion_length": 751.1953125, "epoch": 0.544, "grad_norm": 0.21272139122976783, "kl": 0.166748046875, "learning_rate": 1.0209424198833571e-05, "loss": 0.0385, "reward": 2.037109375, "reward_std": 0.2493308112025261, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 1360 }, { "clip_ratio": 0.0, "completion_length": 804.1875, "epoch": 0.5444, "grad_norm": 0.3925671903308364, "kl": 0.1494140625, "learning_rate": 1.0195464427448213e-05, "loss": 0.0862, "reward": 1.984375, "reward_std": 0.4385041743516922, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.953125, "step": 1361 }, { "clip_ratio": 0.0, "completion_length": 829.1171875, "epoch": 0.5448, "grad_norm": 0.20363350948363454, "kl": 0.1611328125, "learning_rate": 1.0181504274994949e-05, "loss": 0.062, "reward": 1.873046875, "reward_std": 0.43826617300510406, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.935546875, "step": 1362 }, { "clip_ratio": 0.0, "completion_length": 836.953125, "epoch": 0.5452, "grad_norm": 0.2735716786353401, "kl": 0.16943359375, "learning_rate": 1.0167543768689816e-05, "loss": 0.0373, "reward": 1.970703125, "reward_std": 0.43858057260513306, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.923828125, "step": 1363 }, { "clip_ratio": 0.0, "completion_length": 828.3984375, "epoch": 0.5456, "grad_norm": 0.4313639364361235, "kl": 0.173095703125, "learning_rate": 1.0153582935749531e-05, "loss": 0.0565, "reward": 1.91015625, "reward_std": 0.43588661774992943, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.94140625, "step": 1364 }, { "clip_ratio": 0.0, "completion_length": 770.265625, "epoch": 0.546, "grad_norm": 0.971035258482782, "kl": 0.184814453125, "learning_rate": 1.0139621803391454e-05, "loss": 0.0556, "reward": 2.001953125, "reward_std": 0.3480696976184845, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1365 }, { "clip_ratio": 0.0, "completion_length": 856.6953125, "epoch": 0.5464, "grad_norm": 0.30316774810720787, "kl": 0.163330078125, "learning_rate": 1.0125660398833528e-05, "loss": 0.0389, "reward": 1.95703125, "reward_std": 0.44321518391370773, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.95703125, "step": 1366 }, { "clip_ratio": 0.0, "completion_length": 826.84375, "epoch": 0.5468, "grad_norm": 0.5616608444158718, "kl": 0.162353515625, "learning_rate": 1.0111698749294223e-05, "loss": 0.0477, "reward": 1.9453125, "reward_std": 0.3692111149430275, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.9609375, "step": 1367 }, { "clip_ratio": 0.0, "completion_length": 751.3203125, "epoch": 0.5472, "grad_norm": 0.234820184936036, "kl": 0.179443359375, "learning_rate": 1.0097736881992492e-05, "loss": 0.0308, "reward": 2.01171875, "reward_std": 0.16921419650316238, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1368 }, { "clip_ratio": 0.0, "completion_length": 788.0078125, "epoch": 0.5476, "grad_norm": 2.5912327383817195, "kl": 0.158447265625, "learning_rate": 1.0083774824147707e-05, "loss": 0.0377, "reward": 1.904296875, "reward_std": 0.2234709933400154, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.974609375, "step": 1369 }, { "clip_ratio": 0.0, "completion_length": 784.75, "epoch": 0.548, "grad_norm": 0.30175996031947494, "kl": 0.187744140625, "learning_rate": 1.0069812602979617e-05, "loss": 0.0884, "reward": 1.880859375, "reward_std": 0.4318193346261978, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.943359375, "step": 1370 }, { "clip_ratio": 0.0, "completion_length": 793.40625, "epoch": 0.5484, "grad_norm": 0.20916046257552776, "kl": 0.171875, "learning_rate": 1.0055850245708283e-05, "loss": 0.0312, "reward": 2.177734375, "reward_std": 0.24118494987487793, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1371 }, { "clip_ratio": 0.0, "completion_length": 756.9765625, "epoch": 0.5488, "grad_norm": 0.18202063857803574, "kl": 0.1806640625, "learning_rate": 1.0041887779554041e-05, "loss": 0.0364, "reward": 1.9296875, "reward_std": 0.22035078704357147, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1372 }, { "clip_ratio": 0.0, "completion_length": 699.078125, "epoch": 0.5492, "grad_norm": 0.28939127507501217, "kl": 0.17236328125, "learning_rate": 1.0027925231737428e-05, "loss": 0.0259, "reward": 2.1875, "reward_std": 0.299833245575428, "rewards/accuracy_reward": 0.2421875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1373 }, { "clip_ratio": 0.0, "completion_length": 662.3046875, "epoch": 0.5496, "grad_norm": 2.606295851257317, "kl": 0.314697265625, "learning_rate": 1.0013962629479145e-05, "loss": 0.1542, "reward": 1.95703125, "reward_std": 0.44229550659656525, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.94921875, "step": 1374 }, { "clip_ratio": 0.0, "completion_length": 760.5703125, "epoch": 0.55, "grad_norm": 0.2462477180665864, "kl": 0.176513671875, "learning_rate": 1e-05, "loss": 0.062, "reward": 1.990234375, "reward_std": 0.3176516965031624, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.974609375, "step": 1375 }, { "clip_ratio": 0.0, "completion_length": 569.1796875, "epoch": 0.5504, "grad_norm": 0.22201123534472714, "kl": 0.19140625, "learning_rate": 9.986037370520856e-06, "loss": 0.0464, "reward": 2.205078125, "reward_std": 0.11058919876813889, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1376 }, { "clip_ratio": 0.0, "completion_length": 690.6015625, "epoch": 0.5508, "grad_norm": 0.30134436737382453, "kl": 0.17138671875, "learning_rate": 9.972074768262576e-06, "loss": 0.051, "reward": 1.99609375, "reward_std": 0.1597641110420227, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1377 }, { "clip_ratio": 0.0, "completion_length": 675.65625, "epoch": 0.5512, "grad_norm": 0.22944579718525304, "kl": 0.1748046875, "learning_rate": 9.958112220445964e-06, "loss": 0.0314, "reward": 2.306640625, "reward_std": 0.2171521782875061, "rewards/accuracy_reward": 0.3203125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1378 }, { "clip_ratio": 0.0, "completion_length": 688.2265625, "epoch": 0.5516, "grad_norm": 0.29274861280940817, "kl": 0.19140625, "learning_rate": 9.944149754291719e-06, "loss": 0.0269, "reward": 2.283203125, "reward_std": 0.15657495707273483, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1379 }, { "clip_ratio": 0.0, "completion_length": 662.0703125, "epoch": 0.552, "grad_norm": 0.5053037156543032, "kl": 0.189208984375, "learning_rate": 9.930187397020385e-06, "loss": 0.0827, "reward": 1.95703125, "reward_std": 0.234375, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1380 }, { "clip_ratio": 0.0, "completion_length": 684.03125, "epoch": 0.5524, "grad_norm": 0.550316756611576, "kl": 0.158935546875, "learning_rate": 9.916225175852295e-06, "loss": 0.0953, "reward": 1.91796875, "reward_std": 0.15940609574317932, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1381 }, { "clip_ratio": 0.0, "completion_length": 645.171875, "epoch": 0.5528, "grad_norm": 1.522188594112645, "kl": 0.201171875, "learning_rate": 9.902263118007513e-06, "loss": 0.0867, "reward": 2.05859375, "reward_std": 0.357396736741066, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1382 }, { "clip_ratio": 0.0, "completion_length": 691.6171875, "epoch": 0.5532, "grad_norm": 0.5034712390775323, "kl": 0.2109375, "learning_rate": 9.88830125070578e-06, "loss": 0.0194, "reward": 2.119140625, "reward_std": 0.0859375, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1383 }, { "clip_ratio": 0.0, "completion_length": 638.5078125, "epoch": 0.5536, "grad_norm": 0.28304770380542427, "kl": 0.18994140625, "learning_rate": 9.874339601166474e-06, "loss": 0.0511, "reward": 2.14453125, "reward_std": 0.30703921616077423, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1384 }, { "clip_ratio": 0.0, "completion_length": 722.5703125, "epoch": 0.554, "grad_norm": 0.18922981831518906, "kl": 0.171875, "learning_rate": 9.860378196608549e-06, "loss": 0.0391, "reward": 2.07421875, "reward_std": 0.25203530490398407, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 1385 }, { "clip_ratio": 0.0, "completion_length": 706.4140625, "epoch": 0.5544, "grad_norm": 11.27710732359484, "kl": 0.193115234375, "learning_rate": 9.84641706425047e-06, "loss": 0.0377, "reward": 2.154296875, "reward_std": 0.25360675901174545, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1386 }, { "clip_ratio": 0.0, "completion_length": 709.03125, "epoch": 0.5548, "grad_norm": 0.3331922985210231, "kl": 0.181884765625, "learning_rate": 9.832456231310189e-06, "loss": 0.0217, "reward": 2.189453125, "reward_std": 0.19267656654119492, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1387 }, { "clip_ratio": 0.0, "completion_length": 680.8359375, "epoch": 0.5552, "grad_norm": 1.0210235345150276, "kl": 0.175048828125, "learning_rate": 9.818495725005053e-06, "loss": 0.0826, "reward": 2.16796875, "reward_std": 0.25822463631629944, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1388 }, { "clip_ratio": 0.0, "completion_length": 682.609375, "epoch": 0.5556, "grad_norm": 0.2176986765963886, "kl": 0.18310546875, "learning_rate": 9.80453557255179e-06, "loss": 0.0368, "reward": 1.9609375, "reward_std": 0.15625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 1389 }, { "clip_ratio": 0.0, "completion_length": 735.53125, "epoch": 0.556, "grad_norm": 0.58446624836293, "kl": 0.18310546875, "learning_rate": 9.790575801166432e-06, "loss": 0.0478, "reward": 2.025390625, "reward_std": 0.3532123938202858, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1390 }, { "clip_ratio": 0.0, "completion_length": 703.46875, "epoch": 0.5564, "grad_norm": 0.13444526698368195, "kl": 0.196044921875, "learning_rate": 9.776616438064265e-06, "loss": 0.0529, "reward": 1.962890625, "reward_std": 0.1484375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 1391 }, { "clip_ratio": 0.0, "completion_length": 713.3203125, "epoch": 0.5568, "grad_norm": 0.3595465067303802, "kl": 0.185791015625, "learning_rate": 9.762657510459784e-06, "loss": 0.0999, "reward": 2.12109375, "reward_std": 0.3455469384789467, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1392 }, { "clip_ratio": 0.0, "completion_length": 774.25, "epoch": 0.5572, "grad_norm": 0.18860145954613738, "kl": 0.16845703125, "learning_rate": 9.748699045566626e-06, "loss": 0.0498, "reward": 1.9765625, "reward_std": 0.21323313564062119, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1393 }, { "clip_ratio": 0.0, "completion_length": 745.8984375, "epoch": 0.5576, "grad_norm": 0.2232619892046017, "kl": 0.176513671875, "learning_rate": 9.73474107059754e-06, "loss": 0.0525, "reward": 1.921875, "reward_std": 0.24318470060825348, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1394 }, { "clip_ratio": 0.0, "completion_length": 709.8046875, "epoch": 0.558, "grad_norm": 0.21671520859944543, "kl": 0.18359375, "learning_rate": 9.720783612764314e-06, "loss": 0.0912, "reward": 1.951171875, "reward_std": 0.43847958743572235, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.943359375, "step": 1395 }, { "clip_ratio": 0.0, "completion_length": 730.3359375, "epoch": 0.5584, "grad_norm": 0.31770223340573445, "kl": 0.186767578125, "learning_rate": 9.706826699277719e-06, "loss": 0.0363, "reward": 2.154296875, "reward_std": 0.25179213285446167, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1396 }, { "clip_ratio": 0.0, "completion_length": 787.625, "epoch": 0.5588, "grad_norm": 1.1644791484022572, "kl": 0.206787109375, "learning_rate": 9.692870357347474e-06, "loss": 0.0783, "reward": 1.896484375, "reward_std": 0.40814387053251266, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.951171875, "step": 1397 }, { "clip_ratio": 0.0, "completion_length": 797.5546875, "epoch": 0.5592, "grad_norm": 0.6571713891944984, "kl": 0.208740234375, "learning_rate": 9.678914614182185e-06, "loss": 0.0734, "reward": 1.82421875, "reward_std": 0.4299019128084183, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.92578125, "step": 1398 }, { "clip_ratio": 0.0, "completion_length": 786.1796875, "epoch": 0.5596, "grad_norm": 0.27035154852431253, "kl": 0.162353515625, "learning_rate": 9.664959496989286e-06, "loss": 0.0384, "reward": 2.234375, "reward_std": 0.3665374889969826, "rewards/accuracy_reward": 0.2890625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1399 }, { "clip_ratio": 0.0, "completion_length": 834.7578125, "epoch": 0.56, "grad_norm": 19.00775458991425, "kl": 0.198974609375, "learning_rate": 9.651005032974994e-06, "loss": 0.0672, "reward": 1.876953125, "reward_std": 0.5047046169638634, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.931640625, "step": 1400 }, { "clip_ratio": 0.0, "completion_length": 761.3828125, "epoch": 0.5604, "grad_norm": 279.924141324198, "kl": 1.59375, "learning_rate": 9.637051249344244e-06, "loss": 0.0985, "reward": 2.01171875, "reward_std": 0.24411680549383163, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98828125, "step": 1401 }, { "clip_ratio": 0.0, "completion_length": 730.65625, "epoch": 0.5608, "grad_norm": 28315.823643708172, "kl": 496.0, "learning_rate": 9.623098173300655e-06, "loss": 19.8019, "reward": 1.90625, "reward_std": 0.30933766812086105, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.9609375, "step": 1402 }, { "clip_ratio": 0.0, "completion_length": 859.3828125, "epoch": 0.5612, "grad_norm": 25747.71588740549, "kl": 800.5, "learning_rate": 9.609145832046465e-06, "loss": 32.4832, "reward": 1.76953125, "reward_std": 0.5882468968629837, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.8203125, "rewards/tag_count_reward": 0.89453125, "step": 1403 }, { "clip_ratio": 0.0, "completion_length": 836.640625, "epoch": 0.5616, "grad_norm": 10588.644157459355, "kl": 182.5, "learning_rate": 9.595194252782476e-06, "loss": 7.2157, "reward": 1.767578125, "reward_std": 0.554316408932209, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.8515625, "rewards/tag_count_reward": 0.892578125, "step": 1404 }, { "clip_ratio": 0.0, "completion_length": 860.921875, "epoch": 0.562, "grad_norm": 15428.215105217794, "kl": 160.625, "learning_rate": 9.581243462708007e-06, "loss": 5.8585, "reward": 1.443359375, "reward_std": 0.8024163544178009, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.640625, "rewards/tag_count_reward": 0.732421875, "step": 1405 }, { "clip_ratio": 0.0, "completion_length": 751.484375, "epoch": 0.5624, "grad_norm": 647.9517155526796, "kl": 7.609375, "learning_rate": 9.567293489020831e-06, "loss": 0.4813, "reward": 1.623046875, "reward_std": 0.7811601608991623, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.7421875, "rewards/tag_count_reward": 0.818359375, "step": 1406 }, { "clip_ratio": 0.0, "completion_length": 893.828125, "epoch": 0.5628, "grad_norm": 50.13742715202977, "kl": 0.59814453125, "learning_rate": 9.553344358917141e-06, "loss": 0.1162, "reward": 1.494140625, "reward_std": 0.806006133556366, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.6796875, "rewards/tag_count_reward": 0.791015625, "step": 1407 }, { "clip_ratio": 0.0, "completion_length": 867.4296875, "epoch": 0.5632, "grad_norm": 50.08262938619216, "kl": 0.61962890625, "learning_rate": 9.539396099591477e-06, "loss": 0.175, "reward": 1.541015625, "reward_std": 0.9876844733953476, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.5390625, "rewards/tag_count_reward": 0.658203125, "step": 1408 }, { "clip_ratio": 0.0, "completion_length": 929.28125, "epoch": 0.5636, "grad_norm": 6.780865246428064, "kl": 0.5400390625, "learning_rate": 9.525448738236691e-06, "loss": 0.155, "reward": 0.818359375, "reward_std": 0.8371032923460007, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.2890625, "rewards/tag_count_reward": 0.482421875, "step": 1409 }, { "clip_ratio": 0.0, "completion_length": 905.625, "epoch": 0.564, "grad_norm": 19.379957197873626, "kl": 0.64990234375, "learning_rate": 9.511502302043867e-06, "loss": 0.1902, "reward": 1.046875, "reward_std": 0.8670955449342728, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.375, "rewards/tag_count_reward": 0.5390625, "step": 1410 }, { "clip_ratio": 0.0, "completion_length": 913.1171875, "epoch": 0.5644, "grad_norm": 15.37342133657563, "kl": 0.4375, "learning_rate": 9.497556818202306e-06, "loss": 0.1586, "reward": 1.080078125, "reward_std": 0.8207687437534332, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.3984375, "rewards/tag_count_reward": 0.556640625, "step": 1411 }, { "clip_ratio": 0.0, "completion_length": 925.2109375, "epoch": 0.5648, "grad_norm": 1.0373213526241463, "kl": 0.255859375, "learning_rate": 9.483612313899436e-06, "loss": 0.1289, "reward": 1.240234375, "reward_std": 0.8381908982992172, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.4140625, "rewards/tag_count_reward": 0.576171875, "step": 1412 }, { "clip_ratio": 0.0, "completion_length": 807.59375, "epoch": 0.5652, "grad_norm": 126.99527822390115, "kl": 2.584228515625, "learning_rate": 9.469668816320785e-06, "loss": 0.2989, "reward": 1.47265625, "reward_std": 0.9106041789054871, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.65625, "rewards/tag_count_reward": 0.74609375, "step": 1413 }, { "clip_ratio": 0.0, "completion_length": 803.71875, "epoch": 0.5656, "grad_norm": 3.2964316165785936, "kl": 0.239013671875, "learning_rate": 9.45572635264991e-06, "loss": 0.0866, "reward": 1.998046875, "reward_std": 0.5463502481579781, "rewards/accuracy_reward": 0.2734375, "rewards/format_reward": 0.8203125, "rewards/tag_count_reward": 0.904296875, "step": 1414 }, { "clip_ratio": 0.0, "completion_length": 807.8125, "epoch": 0.566, "grad_norm": 0.9639910252922543, "kl": 0.18212890625, "learning_rate": 9.441784950068362e-06, "loss": 0.0848, "reward": 1.779296875, "reward_std": 0.5358924493193626, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.828125, "rewards/tag_count_reward": 0.888671875, "step": 1415 }, { "clip_ratio": 0.0, "completion_length": 853.390625, "epoch": 0.5664, "grad_norm": 0.4653928049548024, "kl": 0.18505859375, "learning_rate": 9.42784463575562e-06, "loss": 0.0348, "reward": 1.69921875, "reward_std": 0.4130070060491562, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.796875, "rewards/tag_count_reward": 0.89453125, "step": 1416 }, { "clip_ratio": 0.0, "completion_length": 684.546875, "epoch": 0.5668, "grad_norm": 0.4627188877370346, "kl": 0.171142578125, "learning_rate": 9.413905436889035e-06, "loss": 0.0268, "reward": 2.111328125, "reward_std": 0.3274032697081566, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1417 }, { "clip_ratio": 0.0, "completion_length": 823.7578125, "epoch": 0.5672, "grad_norm": 0.32299478652566005, "kl": 0.1572265625, "learning_rate": 9.399967380643795e-06, "loss": 0.0484, "reward": 1.88671875, "reward_std": 0.43303969502449036, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.93359375, "step": 1418 }, { "clip_ratio": 0.0, "completion_length": 752.6953125, "epoch": 0.5676, "grad_norm": 0.2864246626657245, "kl": 0.155029296875, "learning_rate": 9.386030494192847e-06, "loss": 0.0738, "reward": 1.92578125, "reward_std": 0.3931729793548584, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.94921875, "step": 1419 }, { "clip_ratio": 0.0, "completion_length": 823.125, "epoch": 0.568, "grad_norm": 0.27199068479605737, "kl": 0.165771484375, "learning_rate": 9.372094804706867e-06, "loss": 0.0432, "reward": 1.927734375, "reward_std": 0.3750196099281311, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.958984375, "step": 1420 }, { "clip_ratio": 0.0, "completion_length": 728.6171875, "epoch": 0.5684, "grad_norm": 0.22777258815402326, "kl": 0.18115234375, "learning_rate": 9.358160339354194e-06, "loss": 0.0424, "reward": 1.98828125, "reward_std": 0.193240474909544, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 1421 }, { "clip_ratio": 0.0, "completion_length": 594.5703125, "epoch": 0.5688, "grad_norm": 0.3418303996675255, "kl": 0.174560546875, "learning_rate": 9.344227125300788e-06, "loss": 0.1123, "reward": 1.966796875, "reward_std": 0.1953125, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1422 }, { "clip_ratio": 0.0, "completion_length": 687.9375, "epoch": 0.5692, "grad_norm": 0.17520393505241053, "kl": 0.1591796875, "learning_rate": 9.330295189710153e-06, "loss": 0.0408, "reward": 2.04296875, "reward_std": 0.22511589527130127, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1423 }, { "clip_ratio": 0.0, "completion_length": 681.5703125, "epoch": 0.5696, "grad_norm": 0.31715181049311175, "kl": 0.170654296875, "learning_rate": 9.316364559743315e-06, "loss": 0.0447, "reward": 2.1015625, "reward_std": 0.27934853732585907, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1424 }, { "clip_ratio": 0.0, "completion_length": 671.3828125, "epoch": 0.57, "grad_norm": 0.33857390747789523, "kl": 0.187255859375, "learning_rate": 9.302435262558748e-06, "loss": 0.0132, "reward": 2.25, "reward_std": 0.12433473765850067, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1425 }, { "clip_ratio": 0.0, "completion_length": 636.71875, "epoch": 0.5704, "grad_norm": 0.2715058635567726, "kl": 0.157470703125, "learning_rate": 9.288507325312334e-06, "loss": 0.0265, "reward": 2.099609375, "reward_std": 0.1640625, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 1426 }, { "clip_ratio": 0.0, "completion_length": 733.703125, "epoch": 0.5708, "grad_norm": 0.21725362345625915, "kl": 0.16748046875, "learning_rate": 9.274580775157294e-06, "loss": 0.0508, "reward": 1.978515625, "reward_std": 0.29452355951070786, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1427 }, { "clip_ratio": 0.0, "completion_length": 673.484375, "epoch": 0.5712, "grad_norm": 0.21438376514900248, "kl": 0.1668701171875, "learning_rate": 9.260655639244152e-06, "loss": 0.0241, "reward": 2.0234375, "reward_std": 0.1712053380906582, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1428 }, { "clip_ratio": 0.0, "completion_length": 656.734375, "epoch": 0.5716, "grad_norm": 0.25431282798357935, "kl": 0.1630859375, "learning_rate": 9.246731944720675e-06, "loss": 0.0236, "reward": 2.130859375, "reward_std": 0.19719929993152618, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 1429 }, { "clip_ratio": 0.0, "completion_length": 670.2421875, "epoch": 0.572, "grad_norm": 0.16272744072565717, "kl": 0.160400390625, "learning_rate": 9.232809718731815e-06, "loss": 0.0187, "reward": 2.126953125, "reward_std": 0.09738312661647797, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1430 }, { "clip_ratio": 0.0, "completion_length": 586.171875, "epoch": 0.5724, "grad_norm": 0.2750465634935259, "kl": 0.162841796875, "learning_rate": 9.218888988419668e-06, "loss": 0.0498, "reward": 1.986328125, "reward_std": 0.20121560245752335, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 1431 }, { "clip_ratio": 0.0, "completion_length": 731.0859375, "epoch": 0.5728, "grad_norm": 0.19750633271974088, "kl": 0.16650390625, "learning_rate": 9.204969780923404e-06, "loss": 0.002, "reward": 2.1640625, "reward_std": 0.14568255096673965, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1432 }, { "clip_ratio": 0.0, "completion_length": 642.1640625, "epoch": 0.5732, "grad_norm": 0.1601681560247271, "kl": 0.174072265625, "learning_rate": 9.191052123379234e-06, "loss": 0.0343, "reward": 2.123046875, "reward_std": 0.20328575372695923, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1433 }, { "clip_ratio": 0.0, "completion_length": 669.625, "epoch": 0.5736, "grad_norm": 0.13371795453620594, "kl": 0.163818359375, "learning_rate": 9.177136042920344e-06, "loss": 0.0188, "reward": 2.109375, "reward_std": 0.042695626616477966, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1434 }, { "clip_ratio": 0.0, "completion_length": 666.140625, "epoch": 0.574, "grad_norm": 0.16640009678657647, "kl": 0.173583984375, "learning_rate": 9.163221566676847e-06, "loss": 0.009, "reward": 2.017578125, "reward_std": 0.11515908688306808, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1435 }, { "clip_ratio": 0.0, "completion_length": 670.0859375, "epoch": 0.5744, "grad_norm": 0.19039912196323922, "kl": 0.147705078125, "learning_rate": 9.14930872177572e-06, "loss": 0.0177, "reward": 2.046875, "reward_std": 0.16507697105407715, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1436 }, { "clip_ratio": 0.0, "completion_length": 729.90625, "epoch": 0.5748, "grad_norm": 0.3229324166671009, "kl": 0.162841796875, "learning_rate": 9.135397535340773e-06, "loss": 0.0503, "reward": 2.046875, "reward_std": 0.18954972177743912, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1437 }, { "clip_ratio": 0.0, "completion_length": 732.84375, "epoch": 0.5752, "grad_norm": 0.13181954857178438, "kl": 0.17626953125, "learning_rate": 9.121488034492569e-06, "loss": 0.0248, "reward": 2.1171875, "reward_std": 0.1116529181599617, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1438 }, { "clip_ratio": 0.0, "completion_length": 644.4765625, "epoch": 0.5756, "grad_norm": 0.19327457813363638, "kl": 0.1884765625, "learning_rate": 9.107580246348395e-06, "loss": 0.0313, "reward": 1.994140625, "reward_std": 0.0859375, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1439 }, { "clip_ratio": 0.0, "completion_length": 657.5625, "epoch": 0.576, "grad_norm": 0.3359798455772849, "kl": 0.181396484375, "learning_rate": 9.093674198022201e-06, "loss": 0.0212, "reward": 2.00390625, "reward_std": 0.1910141110420227, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1440 }, { "clip_ratio": 0.0, "completion_length": 741.3359375, "epoch": 0.5764, "grad_norm": 0.1946203688625386, "kl": 0.16259765625, "learning_rate": 9.07976991662453e-06, "loss": 0.0436, "reward": 2.00390625, "reward_std": 0.16527669876813889, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1441 }, { "clip_ratio": 0.0, "completion_length": 746.5, "epoch": 0.5768, "grad_norm": 0.19025128874334316, "kl": 0.15185546875, "learning_rate": 9.065867429262497e-06, "loss": 0.0329, "reward": 2.013671875, "reward_std": 0.2250283658504486, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1442 }, { "clip_ratio": 0.0, "completion_length": 745.71875, "epoch": 0.5772, "grad_norm": 0.24571771333450257, "kl": 0.163330078125, "learning_rate": 9.051966763039706e-06, "loss": 0.0724, "reward": 2.0, "reward_std": 0.25726838409900665, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1443 }, { "clip_ratio": 0.0, "completion_length": 716.3515625, "epoch": 0.5776, "grad_norm": 0.26237895621190177, "kl": 0.16259765625, "learning_rate": 9.038067945056229e-06, "loss": 0.1338, "reward": 2.134765625, "reward_std": 0.5737144947052002, "rewards/accuracy_reward": 0.2734375, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.939453125, "step": 1444 }, { "clip_ratio": 0.0, "completion_length": 735.8046875, "epoch": 0.578, "grad_norm": 0.18106545085635642, "kl": 0.1552734375, "learning_rate": 9.024171002408507e-06, "loss": 0.0772, "reward": 1.9375, "reward_std": 0.29112907499074936, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.9609375, "step": 1445 }, { "clip_ratio": 0.0, "completion_length": 706.9453125, "epoch": 0.5784, "grad_norm": 0.21339017270506996, "kl": 0.160400390625, "learning_rate": 9.01027596218935e-06, "loss": 0.0422, "reward": 1.994140625, "reward_std": 0.25200794637203217, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 1446 }, { "clip_ratio": 0.0, "completion_length": 697.1484375, "epoch": 0.5788, "grad_norm": 0.38627684471732615, "kl": 0.144287109375, "learning_rate": 8.996382851487851e-06, "loss": 0.067, "reward": 2.0234375, "reward_std": 0.37396930903196335, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.96875, "step": 1447 }, { "clip_ratio": 0.0, "completion_length": 776.4296875, "epoch": 0.5792, "grad_norm": 0.13113565497946209, "kl": 0.156494140625, "learning_rate": 8.982491697389339e-06, "loss": 0.0325, "reward": 2.083984375, "reward_std": 0.12940484285354614, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1448 }, { "clip_ratio": 0.0, "completion_length": 769.2890625, "epoch": 0.5796, "grad_norm": 1.269778212947387, "kl": 0.17333984375, "learning_rate": 8.968602526975329e-06, "loss": 0.0559, "reward": 1.986328125, "reward_std": 0.3635673448443413, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.970703125, "step": 1449 }, { "clip_ratio": 0.0, "completion_length": 710.8125, "epoch": 0.58, "grad_norm": 0.13808840574910106, "kl": 0.1639404296875, "learning_rate": 8.954715367323468e-06, "loss": 0.0339, "reward": 2.021484375, "reward_std": 0.16948182880878448, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1450 }, { "clip_ratio": 0.0, "completion_length": 797.890625, "epoch": 0.5804, "grad_norm": 0.2793258294102108, "kl": 0.161376953125, "learning_rate": 8.940830245507483e-06, "loss": 0.0367, "reward": 1.900390625, "reward_std": 0.2914903238415718, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.962890625, "step": 1451 }, { "clip_ratio": 0.0, "completion_length": 694.2109375, "epoch": 0.5808, "grad_norm": 0.30173377165490967, "kl": 0.1630859375, "learning_rate": 8.926947188597133e-06, "loss": 0.0313, "reward": 2.0625, "reward_std": 0.37188366055488586, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1452 }, { "clip_ratio": 0.0, "completion_length": 724.96875, "epoch": 0.5812, "grad_norm": 0.2529808703853582, "kl": 0.171142578125, "learning_rate": 8.913066223658152e-06, "loss": 0.0397, "reward": 1.99609375, "reward_std": 0.23632918298244476, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1453 }, { "clip_ratio": 0.0, "completion_length": 699.3359375, "epoch": 0.5816, "grad_norm": 0.27370045287800154, "kl": 0.17724609375, "learning_rate": 8.89918737775218e-06, "loss": 0.0262, "reward": 2.013671875, "reward_std": 0.28463155031204224, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 1454 }, { "clip_ratio": 0.0, "completion_length": 663.28125, "epoch": 0.582, "grad_norm": 0.34075464648347104, "kl": 0.160888671875, "learning_rate": 8.885310677936746e-06, "loss": 0.0593, "reward": 2.068359375, "reward_std": 0.27849505096673965, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 1455 }, { "clip_ratio": 0.0, "completion_length": 672.0859375, "epoch": 0.5824, "grad_norm": 0.3013948275382568, "kl": 0.149169921875, "learning_rate": 8.871436151265183e-06, "loss": 0.0321, "reward": 2.32421875, "reward_std": 0.19652669876813889, "rewards/accuracy_reward": 0.3515625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1456 }, { "clip_ratio": 0.0, "completion_length": 773.28125, "epoch": 0.5828, "grad_norm": 0.2169823506104224, "kl": 0.162841796875, "learning_rate": 8.857563824786598e-06, "loss": 0.0495, "reward": 2.119140625, "reward_std": 0.31290576606988907, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1457 }, { "clip_ratio": 0.0, "completion_length": 689.6875, "epoch": 0.5832, "grad_norm": 0.20341890506649168, "kl": 0.142578125, "learning_rate": 8.843693725545787e-06, "loss": 0.0853, "reward": 1.984375, "reward_std": 0.20325922966003418, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1458 }, { "clip_ratio": 0.0, "completion_length": 638.2578125, "epoch": 0.5836, "grad_norm": 0.39131679242557055, "kl": 0.172119140625, "learning_rate": 8.829825880583228e-06, "loss": 0.0456, "reward": 1.9921875, "reward_std": 0.23897846043109894, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1459 }, { "clip_ratio": 0.0, "completion_length": 741.59375, "epoch": 0.584, "grad_norm": 0.20296865416812393, "kl": 0.164794921875, "learning_rate": 8.815960316934991e-06, "loss": 0.0608, "reward": 1.904296875, "reward_std": 0.3134971931576729, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 1460 }, { "clip_ratio": 0.0, "completion_length": 589.46875, "epoch": 0.5844, "grad_norm": 0.22834076183527113, "kl": 0.17333984375, "learning_rate": 8.802097061632706e-06, "loss": 0.0384, "reward": 2.25390625, "reward_std": 0.27484130859375, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 1461 }, { "clip_ratio": 0.0, "completion_length": 634.671875, "epoch": 0.5848, "grad_norm": 0.2094591443076428, "kl": 0.167724609375, "learning_rate": 8.788236141703498e-06, "loss": 0.0695, "reward": 1.9765625, "reward_std": 0.19894562661647797, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 1462 }, { "clip_ratio": 0.0, "completion_length": 657.5390625, "epoch": 0.5852, "grad_norm": 0.24161723771706498, "kl": 0.1640625, "learning_rate": 8.774377584169934e-06, "loss": 0.0667, "reward": 1.947265625, "reward_std": 0.2109375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 1463 }, { "clip_ratio": 0.0, "completion_length": 743.875, "epoch": 0.5856, "grad_norm": 0.19491211769747113, "kl": 0.15234375, "learning_rate": 8.760521416049983e-06, "loss": 0.0116, "reward": 2.005859375, "reward_std": 0.08175812661647797, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 1464 }, { "clip_ratio": 0.0, "completion_length": 684.6015625, "epoch": 0.586, "grad_norm": 0.25818576752043637, "kl": 0.16064453125, "learning_rate": 8.746667664356957e-06, "loss": 0.0691, "reward": 2.166015625, "reward_std": 0.32255594432353973, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 1465 }, { "clip_ratio": 0.0, "completion_length": 662.46875, "epoch": 0.5864, "grad_norm": 0.2544059985317011, "kl": 0.1513671875, "learning_rate": 8.732816356099455e-06, "loss": 0.0454, "reward": 2.0859375, "reward_std": 0.1753891110420227, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1466 }, { "clip_ratio": 0.0, "completion_length": 698.453125, "epoch": 0.5868, "grad_norm": 0.5118562665363323, "kl": 0.15087890625, "learning_rate": 8.718967518281307e-06, "loss": 0.0318, "reward": 2.04296875, "reward_std": 0.109375, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1467 }, { "clip_ratio": 0.0, "completion_length": 723.8515625, "epoch": 0.5872, "grad_norm": 0.2911337791485127, "kl": 0.189208984375, "learning_rate": 8.705121177901532e-06, "loss": 0.0216, "reward": 1.990234375, "reward_std": 0.2585003525018692, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1468 }, { "clip_ratio": 0.0, "completion_length": 786.2734375, "epoch": 0.5876, "grad_norm": 3.163295885104741, "kl": 0.180419921875, "learning_rate": 8.69127736195428e-06, "loss": 0.096, "reward": 1.984375, "reward_std": 0.5241260528564453, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9296875, "step": 1469 }, { "clip_ratio": 0.0, "completion_length": 667.7421875, "epoch": 0.588, "grad_norm": 0.15241225153668847, "kl": 0.1494140625, "learning_rate": 8.677436097428775e-06, "loss": 0.0491, "reward": 2.076171875, "reward_std": 0.20094293355941772, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.974609375, "step": 1470 }, { "clip_ratio": 0.0, "completion_length": 686.484375, "epoch": 0.5884, "grad_norm": 0.3005460581687042, "kl": 0.159912109375, "learning_rate": 8.663597411309278e-06, "loss": 0.0302, "reward": 2.119140625, "reward_std": 0.22772305458784103, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1471 }, { "clip_ratio": 0.0, "completion_length": 702.6875, "epoch": 0.5888, "grad_norm": 0.2231040027070013, "kl": 0.146484375, "learning_rate": 8.649761330575009e-06, "loss": 0.0114, "reward": 2.017578125, "reward_std": 0.15988312661647797, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1472 }, { "clip_ratio": 0.0, "completion_length": 663.8515625, "epoch": 0.5892, "grad_norm": 0.2813369668490214, "kl": 0.1578369140625, "learning_rate": 8.635927882200117e-06, "loss": 0.0764, "reward": 1.9453125, "reward_std": 0.32287635654211044, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.9609375, "step": 1473 }, { "clip_ratio": 0.0, "completion_length": 721.1171875, "epoch": 0.5896, "grad_norm": 0.2585359329173019, "kl": 0.17236328125, "learning_rate": 8.62209709315362e-06, "loss": 0.0545, "reward": 1.98046875, "reward_std": 0.26846619695425034, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1474 }, { "clip_ratio": 0.0, "completion_length": 766.6640625, "epoch": 0.59, "grad_norm": 0.18488478086304952, "kl": 0.140625, "learning_rate": 8.60826899039935e-06, "loss": 0.0334, "reward": 2.08984375, "reward_std": 0.19805096089839935, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1475 }, { "clip_ratio": 0.0, "completion_length": 681.4921875, "epoch": 0.5904, "grad_norm": 0.20009813570955592, "kl": 0.1456298828125, "learning_rate": 8.594443600895892e-06, "loss": 0.0544, "reward": 1.98046875, "reward_std": 0.32781410217285156, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1476 }, { "clip_ratio": 0.0, "completion_length": 677.1015625, "epoch": 0.5908, "grad_norm": 0.24425417023617244, "kl": 0.16162109375, "learning_rate": 8.580620951596556e-06, "loss": 0.0443, "reward": 2.05078125, "reward_std": 0.20466843992471695, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1477 }, { "clip_ratio": 0.0, "completion_length": 636.6171875, "epoch": 0.5912, "grad_norm": 0.23989084829593657, "kl": 0.1630859375, "learning_rate": 8.566801069449307e-06, "loss": 0.1135, "reward": 2.17578125, "reward_std": 0.21885646134614944, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1478 }, { "clip_ratio": 0.0, "completion_length": 707.671875, "epoch": 0.5916, "grad_norm": 0.27290766777564107, "kl": 0.158447265625, "learning_rate": 8.552983981396709e-06, "loss": 0.0178, "reward": 2.10546875, "reward_std": 0.2581626921892166, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1479 }, { "clip_ratio": 0.0, "completion_length": 729.015625, "epoch": 0.592, "grad_norm": 0.2953208708885324, "kl": 0.137939453125, "learning_rate": 8.539169714375885e-06, "loss": 0.0465, "reward": 2.033203125, "reward_std": 0.26887524127960205, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 1480 }, { "clip_ratio": 0.0, "completion_length": 739.0078125, "epoch": 0.5924, "grad_norm": 0.19190867665054223, "kl": 0.156494140625, "learning_rate": 8.525358295318454e-06, "loss": 0.0276, "reward": 1.94921875, "reward_std": 0.16846735030412674, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1481 }, { "clip_ratio": 0.0, "completion_length": 696.734375, "epoch": 0.5928, "grad_norm": 0.22405560785080772, "kl": 0.146484375, "learning_rate": 8.511549751150478e-06, "loss": 0.031, "reward": 1.9921875, "reward_std": 0.20594120025634766, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 1482 }, { "clip_ratio": 0.0, "completion_length": 677.8046875, "epoch": 0.5932, "grad_norm": 0.3292245663031925, "kl": 0.14990234375, "learning_rate": 8.49774410879243e-06, "loss": 0.0122, "reward": 2.1015625, "reward_std": 0.15434854477643967, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1483 }, { "clip_ratio": 0.0, "completion_length": 665.515625, "epoch": 0.5936, "grad_norm": 0.10930731042522826, "kl": 0.14453125, "learning_rate": 8.483941395159114e-06, "loss": 0.0013, "reward": 2.1015625, "reward_std": 0.050389111042022705, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1484 }, { "clip_ratio": 0.0, "completion_length": 663.9375, "epoch": 0.594, "grad_norm": 0.3080602898318493, "kl": 0.156494140625, "learning_rate": 8.47014163715962e-06, "loss": 0.0029, "reward": 2.23828125, "reward_std": 0.2825930714607239, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1485 }, { "clip_ratio": 0.0, "completion_length": 657.3671875, "epoch": 0.5944, "grad_norm": 0.2229079587291245, "kl": 0.1591796875, "learning_rate": 8.45634486169729e-06, "loss": 0.0222, "reward": 2.15625, "reward_std": 0.2191799208521843, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1486 }, { "clip_ratio": 0.0, "completion_length": 688.3125, "epoch": 0.5948, "grad_norm": 1.7524628984453967, "kl": 0.1650390625, "learning_rate": 8.44255109566964e-06, "loss": 0.0567, "reward": 2.267578125, "reward_std": 0.3453032970428467, "rewards/accuracy_reward": 0.3046875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 1487 }, { "clip_ratio": 0.0, "completion_length": 742.8046875, "epoch": 0.5952, "grad_norm": 0.3610720755923232, "kl": 0.162109375, "learning_rate": 8.428760365968327e-06, "loss": 0.0323, "reward": 2.10546875, "reward_std": 0.19993264973163605, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1488 }, { "clip_ratio": 0.0, "completion_length": 728.9140625, "epoch": 0.5956, "grad_norm": 0.22088129744832508, "kl": 0.151123046875, "learning_rate": 8.414972699479076e-06, "loss": 0.0327, "reward": 2.09375, "reward_std": 0.20544016361236572, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 1489 }, { "clip_ratio": 0.0, "completion_length": 636.171875, "epoch": 0.596, "grad_norm": 0.28153520644322305, "kl": 0.173095703125, "learning_rate": 8.401188123081653e-06, "loss": 0.093, "reward": 2.337890625, "reward_std": 0.1484375, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 1490 }, { "clip_ratio": 0.0, "completion_length": 684.9921875, "epoch": 0.5964, "grad_norm": 0.2396949255513091, "kl": 0.14892578125, "learning_rate": 8.387406663649796e-06, "loss": 0.065, "reward": 1.99609375, "reward_std": 0.1585279181599617, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1491 }, { "clip_ratio": 0.0, "completion_length": 675.1796875, "epoch": 0.5968, "grad_norm": 0.24685607452064667, "kl": 0.14111328125, "learning_rate": 8.373628348051165e-06, "loss": 0.0163, "reward": 2.33203125, "reward_std": 0.26716843992471695, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1492 }, { "clip_ratio": 0.0, "completion_length": 649.890625, "epoch": 0.5972, "grad_norm": 1.2407131527769921, "kl": 0.232666015625, "learning_rate": 8.35985320314729e-06, "loss": 0.0362, "reward": 2.18359375, "reward_std": 0.22307763248682022, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1493 }, { "clip_ratio": 0.0, "completion_length": 742.6796875, "epoch": 0.5976, "grad_norm": 0.2173155266168518, "kl": 0.137939453125, "learning_rate": 8.346081255793524e-06, "loss": 0.0552, "reward": 1.935546875, "reward_std": 0.2578125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1494 }, { "clip_ratio": 0.0, "completion_length": 621.046875, "epoch": 0.598, "grad_norm": 0.3216395639187231, "kl": 0.144287109375, "learning_rate": 8.332312532838978e-06, "loss": 0.0787, "reward": 2.2734375, "reward_std": 0.27224497497081757, "rewards/accuracy_reward": 0.3203125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 1495 }, { "clip_ratio": 0.0, "completion_length": 810.890625, "epoch": 0.5984, "grad_norm": 0.37955052592463057, "kl": 0.160400390625, "learning_rate": 8.318547061126485e-06, "loss": 0.0472, "reward": 2.048828125, "reward_std": 0.36476781964302063, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.955078125, "step": 1496 }, { "clip_ratio": 0.0, "completion_length": 711.5234375, "epoch": 0.5988, "grad_norm": 0.26207413831520737, "kl": 0.155517578125, "learning_rate": 8.30478486749254e-06, "loss": 0.0186, "reward": 2.001953125, "reward_std": 0.21131285279989243, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 1497 }, { "clip_ratio": 0.0, "completion_length": 660.5, "epoch": 0.5992, "grad_norm": 0.28507823356472506, "kl": 0.15869140625, "learning_rate": 8.291025978767236e-06, "loss": 0.0484, "reward": 2.34765625, "reward_std": 0.3033473566174507, "rewards/accuracy_reward": 0.3828125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98828125, "step": 1498 }, { "clip_ratio": 0.0, "completion_length": 722.6171875, "epoch": 0.5996, "grad_norm": 0.47982543472108957, "kl": 0.1541748046875, "learning_rate": 8.277270421774234e-06, "loss": 0.0731, "reward": 2.0078125, "reward_std": 0.40708088874816895, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1499 }, { "clip_ratio": 0.0, "completion_length": 742.984375, "epoch": 0.6, "grad_norm": 0.2936903749372212, "kl": 0.15966796875, "learning_rate": 8.263518223330698e-06, "loss": 0.0498, "reward": 2.162109375, "reward_std": 0.2953966557979584, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.982421875, "step": 1500 }, { "clip_ratio": 0.0, "completion_length": 660.2109375, "epoch": 0.6004, "grad_norm": 0.5196772046256147, "kl": 0.1806640625, "learning_rate": 8.249769410247239e-06, "loss": 0.0716, "reward": 1.994140625, "reward_std": 0.2038249894976616, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 1501 }, { "clip_ratio": 0.0, "completion_length": 690.609375, "epoch": 0.6008, "grad_norm": 0.3104189244895093, "kl": 0.193115234375, "learning_rate": 8.236024009327879e-06, "loss": 0.0797, "reward": 2.3984375, "reward_std": 0.3828677833080292, "rewards/accuracy_reward": 0.4921875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.9609375, "step": 1502 }, { "clip_ratio": 0.0, "completion_length": 752.25, "epoch": 0.6012, "grad_norm": 0.690032629809752, "kl": 0.15771484375, "learning_rate": 8.222282047369972e-06, "loss": 0.0749, "reward": 1.919921875, "reward_std": 0.2901762127876282, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 1503 }, { "clip_ratio": 0.0, "completion_length": 697.6953125, "epoch": 0.6016, "grad_norm": 0.3549872142329855, "kl": 0.169921875, "learning_rate": 8.208543551164178e-06, "loss": 0.1118, "reward": 2.060546875, "reward_std": 0.28565485030412674, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1504 }, { "clip_ratio": 0.0, "completion_length": 597.875, "epoch": 0.602, "grad_norm": 0.3681230081600037, "kl": 0.14501953125, "learning_rate": 8.194808547494401e-06, "loss": 0.093, "reward": 1.9140625, "reward_std": 0.33230943232774734, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.953125, "step": 1505 }, { "clip_ratio": 0.0, "completion_length": 715.4453125, "epoch": 0.6024, "grad_norm": 0.1532767515952219, "kl": 0.155517578125, "learning_rate": 8.181077063137733e-06, "loss": 0.0361, "reward": 1.97265625, "reward_std": 0.109375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1506 }, { "clip_ratio": 0.0, "completion_length": 758.1015625, "epoch": 0.6028, "grad_norm": 0.27540046268423957, "kl": 0.158203125, "learning_rate": 8.167349124864406e-06, "loss": 0.0624, "reward": 2.05078125, "reward_std": 0.3442452847957611, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 1507 }, { "clip_ratio": 0.0, "completion_length": 756.6171875, "epoch": 0.6032, "grad_norm": 0.15659919134122963, "kl": 0.154541015625, "learning_rate": 8.153624759437733e-06, "loss": 0.0442, "reward": 1.93359375, "reward_std": 0.27014635503292084, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1508 }, { "clip_ratio": 0.0, "completion_length": 715.296875, "epoch": 0.6036, "grad_norm": 0.42300474938690935, "kl": 0.1552734375, "learning_rate": 8.139903993614069e-06, "loss": 0.0824, "reward": 2.01953125, "reward_std": 0.37855809181928635, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 1509 }, { "clip_ratio": 0.0, "completion_length": 685.6640625, "epoch": 0.604, "grad_norm": 0.2690832635506855, "kl": 0.16162109375, "learning_rate": 8.126186854142752e-06, "loss": 0.066, "reward": 2.22265625, "reward_std": 0.3969612866640091, "rewards/accuracy_reward": 0.2734375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1510 }, { "clip_ratio": 0.0, "completion_length": 727.421875, "epoch": 0.6044, "grad_norm": 0.5901119414592784, "kl": 0.152587890625, "learning_rate": 8.112473367766051e-06, "loss": 0.0783, "reward": 1.8984375, "reward_std": 0.3992316722869873, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.9453125, "step": 1511 }, { "clip_ratio": 0.0, "completion_length": 683.3984375, "epoch": 0.6048, "grad_norm": 0.15186056770806464, "kl": 0.14013671875, "learning_rate": 8.098763561219101e-06, "loss": 0.0698, "reward": 2.111328125, "reward_std": 0.29760903865098953, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.962890625, "step": 1512 }, { "clip_ratio": 0.0, "completion_length": 688.90625, "epoch": 0.6052, "grad_norm": 0.7944915219268159, "kl": 0.16845703125, "learning_rate": 8.08505746122987e-06, "loss": 0.1044, "reward": 2.15234375, "reward_std": 0.27701298892498016, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1513 }, { "clip_ratio": 0.0, "completion_length": 704.9140625, "epoch": 0.6056, "grad_norm": 0.6520438570994064, "kl": 0.14892578125, "learning_rate": 8.07135509451911e-06, "loss": 0.0943, "reward": 1.900390625, "reward_std": 0.44645364582538605, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.939453125, "step": 1514 }, { "clip_ratio": 0.0, "completion_length": 751.2578125, "epoch": 0.606, "grad_norm": 0.5608784124258157, "kl": 0.152099609375, "learning_rate": 8.057656487800283e-06, "loss": 0.1128, "reward": 2.013671875, "reward_std": 0.4951820969581604, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.935546875, "step": 1515 }, { "clip_ratio": 0.0, "completion_length": 707.6953125, "epoch": 0.6064, "grad_norm": 5.335009760684327, "kl": 0.17578125, "learning_rate": 8.04396166777952e-06, "loss": 0.0752, "reward": 2.072265625, "reward_std": 0.3822624608874321, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 1516 }, { "clip_ratio": 0.0, "completion_length": 713.546875, "epoch": 0.6068, "grad_norm": 0.3223348337986434, "kl": 0.1376953125, "learning_rate": 8.030270661155575e-06, "loss": 0.0937, "reward": 2.0859375, "reward_std": 0.40911657363176346, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.953125, "step": 1517 }, { "clip_ratio": 0.0, "completion_length": 711.921875, "epoch": 0.6072, "grad_norm": 0.32241816088436914, "kl": 0.1429443359375, "learning_rate": 8.016583494619769e-06, "loss": 0.0936, "reward": 1.943359375, "reward_std": 0.38177067786455154, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 1518 }, { "clip_ratio": 0.0, "completion_length": 753.8203125, "epoch": 0.6076, "grad_norm": 0.26076789238507425, "kl": 0.15869140625, "learning_rate": 8.00290019485593e-06, "loss": 0.0609, "reward": 1.990234375, "reward_std": 0.433628648519516, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.943359375, "step": 1519 }, { "clip_ratio": 0.0, "completion_length": 803.1640625, "epoch": 0.608, "grad_norm": 0.4785604246953857, "kl": 0.1513671875, "learning_rate": 7.989220788540356e-06, "loss": 0.0589, "reward": 1.90625, "reward_std": 0.5613046437501907, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.9140625, "step": 1520 }, { "clip_ratio": 0.0, "completion_length": 769.0625, "epoch": 0.6084, "grad_norm": 0.2680977774023322, "kl": 0.14892578125, "learning_rate": 7.975545302341743e-06, "loss": 0.1003, "reward": 1.83984375, "reward_std": 0.4992622509598732, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.92578125, "step": 1521 }, { "clip_ratio": 0.0, "completion_length": 748.6953125, "epoch": 0.6088, "grad_norm": 0.244150187788957, "kl": 0.14599609375, "learning_rate": 7.961873762921153e-06, "loss": 0.0787, "reward": 1.958984375, "reward_std": 0.4494116082787514, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.951171875, "step": 1522 }, { "clip_ratio": 0.0, "completion_length": 677.6796875, "epoch": 0.6092, "grad_norm": 0.1951640275468393, "kl": 0.158203125, "learning_rate": 7.948206196931953e-06, "loss": 0.0572, "reward": 1.9765625, "reward_std": 0.2347758263349533, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1523 }, { "clip_ratio": 0.0, "completion_length": 645.359375, "epoch": 0.6096, "grad_norm": 0.7641118498004924, "kl": 0.174560546875, "learning_rate": 7.934542631019767e-06, "loss": 0.0446, "reward": 2.05859375, "reward_std": 0.22431383281946182, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1524 }, { "clip_ratio": 0.0, "completion_length": 644.015625, "epoch": 0.61, "grad_norm": 0.242798750012628, "kl": 0.13623046875, "learning_rate": 7.92088309182241e-06, "loss": 0.0733, "reward": 2.0078125, "reward_std": 0.32394562661647797, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1525 }, { "clip_ratio": 0.0, "completion_length": 765.1796875, "epoch": 0.6104, "grad_norm": 0.23250409777173367, "kl": 0.14599609375, "learning_rate": 7.907227605969849e-06, "loss": 0.0547, "reward": 1.939453125, "reward_std": 0.40628792345523834, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.955078125, "step": 1526 }, { "clip_ratio": 0.0, "completion_length": 801.6328125, "epoch": 0.6108, "grad_norm": 0.33770908731596977, "kl": 0.1312255859375, "learning_rate": 7.89357620008416e-06, "loss": 0.0217, "reward": 2.03515625, "reward_std": 0.28677815943956375, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1527 }, { "clip_ratio": 0.0, "completion_length": 711.671875, "epoch": 0.6112, "grad_norm": 0.6079827273140691, "kl": 0.147216796875, "learning_rate": 7.879928900779457e-06, "loss": 0.0772, "reward": 2.142578125, "reward_std": 0.4365089163184166, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 1528 }, { "clip_ratio": 0.0, "completion_length": 776.5078125, "epoch": 0.6116, "grad_norm": 0.3509554785545465, "kl": 0.154052734375, "learning_rate": 7.866285734661842e-06, "loss": 0.051, "reward": 2.08984375, "reward_std": 0.35495954006910324, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 1529 }, { "clip_ratio": 0.0, "completion_length": 785.21875, "epoch": 0.612, "grad_norm": 0.2732154187627086, "kl": 0.14697265625, "learning_rate": 7.852646728329368e-06, "loss": 0.0262, "reward": 2.021484375, "reward_std": 0.3822467252612114, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 1530 }, { "clip_ratio": 0.0, "completion_length": 717.953125, "epoch": 0.6124, "grad_norm": 0.25938272565370885, "kl": 0.151611328125, "learning_rate": 7.83901190837198e-06, "loss": 0.0343, "reward": 2.126953125, "reward_std": 0.32507117092609406, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 1531 }, { "clip_ratio": 0.0, "completion_length": 749.9765625, "epoch": 0.6128, "grad_norm": 0.43802178153500126, "kl": 0.16259765625, "learning_rate": 7.825381301371452e-06, "loss": 0.0297, "reward": 1.962890625, "reward_std": 0.27919505536556244, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1532 }, { "clip_ratio": 0.0, "completion_length": 747.015625, "epoch": 0.6132, "grad_norm": 0.2918890406957349, "kl": 0.1409912109375, "learning_rate": 7.811754933901358e-06, "loss": 0.0721, "reward": 1.921875, "reward_std": 0.29666510224342346, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.96875, "step": 1533 }, { "clip_ratio": 0.0, "completion_length": 802.859375, "epoch": 0.6136, "grad_norm": 0.1675650544610334, "kl": 0.140625, "learning_rate": 7.798132832526986e-06, "loss": 0.0429, "reward": 1.955078125, "reward_std": 0.2545113116502762, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1534 }, { "clip_ratio": 0.0, "completion_length": 800.171875, "epoch": 0.614, "grad_norm": 0.22902331617086316, "kl": 0.142333984375, "learning_rate": 7.784515023805328e-06, "loss": 0.058, "reward": 1.91015625, "reward_std": 0.4583984240889549, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.94140625, "step": 1535 }, { "clip_ratio": 0.0, "completion_length": 778.53125, "epoch": 0.6144, "grad_norm": 0.23065405747826653, "kl": 0.1373291015625, "learning_rate": 7.770901534284996e-06, "loss": 0.0614, "reward": 2.095703125, "reward_std": 0.3501944839954376, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 1536 }, { "clip_ratio": 0.0, "completion_length": 756.7109375, "epoch": 0.6148, "grad_norm": 0.3237557324525253, "kl": 0.151123046875, "learning_rate": 7.757292390506191e-06, "loss": 0.0542, "reward": 1.990234375, "reward_std": 0.3898453786969185, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.951171875, "step": 1537 }, { "clip_ratio": 0.0, "completion_length": 737.3359375, "epoch": 0.6152, "grad_norm": 0.34230535355343783, "kl": 0.129150390625, "learning_rate": 7.743687619000625e-06, "loss": 0.0565, "reward": 2.0078125, "reward_std": 0.3618512749671936, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.9609375, "step": 1538 }, { "clip_ratio": 0.0, "completion_length": 648.390625, "epoch": 0.6156, "grad_norm": 0.5669607951566432, "kl": 0.205322265625, "learning_rate": 7.730087246291503e-06, "loss": 0.1841, "reward": 2.21484375, "reward_std": 0.48040297627449036, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94921875, "step": 1539 }, { "clip_ratio": 0.0, "completion_length": 727.0078125, "epoch": 0.616, "grad_norm": 0.591346719286379, "kl": 0.1575927734375, "learning_rate": 7.716491298893443e-06, "loss": 0.0701, "reward": 1.92578125, "reward_std": 0.3112270012497902, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 1540 }, { "clip_ratio": 0.0, "completion_length": 650.8515625, "epoch": 0.6164, "grad_norm": 0.2518382641562244, "kl": 0.1494140625, "learning_rate": 7.702899803312443e-06, "loss": 0.0314, "reward": 2.08203125, "reward_std": 0.20101578533649445, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1541 }, { "clip_ratio": 0.0, "completion_length": 723.4609375, "epoch": 0.6168, "grad_norm": 0.29186928425674263, "kl": 0.1693115234375, "learning_rate": 7.689312786045823e-06, "loss": 0.0601, "reward": 1.9453125, "reward_std": 0.37567031383514404, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.953125, "step": 1542 }, { "clip_ratio": 0.0, "completion_length": 717.9453125, "epoch": 0.6172, "grad_norm": 0.19334797553062985, "kl": 0.14794921875, "learning_rate": 7.67573027358216e-06, "loss": 0.0155, "reward": 2.158203125, "reward_std": 0.1542295292019844, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1543 }, { "clip_ratio": 0.0, "completion_length": 671.484375, "epoch": 0.6176, "grad_norm": 2.965136533495464, "kl": 0.1361083984375, "learning_rate": 7.662152292401265e-06, "loss": 0.0616, "reward": 2.083984375, "reward_std": 0.1640625, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1544 }, { "clip_ratio": 0.0, "completion_length": 757.34375, "epoch": 0.618, "grad_norm": 0.18308802059669974, "kl": 0.148193359375, "learning_rate": 7.6485788689741e-06, "loss": 0.0329, "reward": 1.9609375, "reward_std": 0.15625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.984375, "step": 1545 }, { "clip_ratio": 0.0, "completion_length": 739.9375, "epoch": 0.6184, "grad_norm": 0.18985741768870626, "kl": 0.156982421875, "learning_rate": 7.635010029762755e-06, "loss": 0.0096, "reward": 2.154296875, "reward_std": 0.09890169650316238, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 1546 }, { "clip_ratio": 0.0, "completion_length": 743.46875, "epoch": 0.6188, "grad_norm": 0.16212147185188194, "kl": 0.13134765625, "learning_rate": 7.621445801220372e-06, "loss": 0.0163, "reward": 2.046875, "reward_std": 0.12433473765850067, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1547 }, { "clip_ratio": 0.0, "completion_length": 712.3671875, "epoch": 0.6192, "grad_norm": 0.40062260702150376, "kl": 0.150146484375, "learning_rate": 7.6078862097911075e-06, "loss": 0.0306, "reward": 2.109375, "reward_std": 0.21608919650316238, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1548 }, { "clip_ratio": 0.0, "completion_length": 695.8515625, "epoch": 0.6196, "grad_norm": 0.32875973539789055, "kl": 0.152099609375, "learning_rate": 7.594331281910082e-06, "loss": 0.0682, "reward": 1.9140625, "reward_std": 0.2185046672821045, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.953125, "step": 1549 }, { "clip_ratio": 0.0, "completion_length": 699.4375, "epoch": 0.62, "grad_norm": 0.0978975244677398, "kl": 0.13818359375, "learning_rate": 7.580781044003324e-06, "loss": 0.0105, "reward": 2.0390625, "reward_std": 0.059839196503162384, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1550 }, { "clip_ratio": 0.0, "completion_length": 768.515625, "epoch": 0.6204, "grad_norm": 0.26294669921924985, "kl": 0.151611328125, "learning_rate": 7.5672355224877115e-06, "loss": 0.0154, "reward": 2.109375, "reward_std": 0.27928202599287033, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 1551 }, { "clip_ratio": 0.0, "completion_length": 710.9375, "epoch": 0.6208, "grad_norm": 0.18073222164704938, "kl": 0.150390625, "learning_rate": 7.553694743770928e-06, "loss": 0.0075, "reward": 2.189453125, "reward_std": 0.1605696976184845, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1552 }, { "clip_ratio": 0.0, "completion_length": 565.890625, "epoch": 0.6212, "grad_norm": 0.26594844486486885, "kl": 0.15478515625, "learning_rate": 7.54015873425142e-06, "loss": 0.0308, "reward": 2.3515625, "reward_std": 0.1128891110420227, "rewards/accuracy_reward": 0.3515625, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1553 }, { "clip_ratio": 0.0, "completion_length": 681.359375, "epoch": 0.6216, "grad_norm": 0.28561422726057534, "kl": 0.1650390625, "learning_rate": 7.526627520318329e-06, "loss": 0.0077, "reward": 2.109375, "reward_std": 0.24606424570083618, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1554 }, { "clip_ratio": 0.0, "completion_length": 640.3125, "epoch": 0.622, "grad_norm": 0.2770373452224835, "kl": 0.1640625, "learning_rate": 7.513101128351454e-06, "loss": 0.0224, "reward": 1.986328125, "reward_std": 0.0546875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1555 }, { "clip_ratio": 0.0, "completion_length": 725.875, "epoch": 0.6224, "grad_norm": 0.1992255931110893, "kl": 0.1533203125, "learning_rate": 7.49957958472118e-06, "loss": 0.0127, "reward": 2.001953125, "reward_std": 0.1171875, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1556 }, { "clip_ratio": 0.0, "completion_length": 716.5078125, "epoch": 0.6228, "grad_norm": 0.1147232897198073, "kl": 0.136962890625, "learning_rate": 7.486062915788453e-06, "loss": 0.0119, "reward": 2.009765625, "reward_std": 0.07882498949766159, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1557 }, { "clip_ratio": 0.0, "completion_length": 706.9375, "epoch": 0.6232, "grad_norm": 0.14438248963722827, "kl": 0.1669921875, "learning_rate": 7.472551147904708e-06, "loss": 0.0148, "reward": 2.0390625, "reward_std": 0.08715169876813889, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1558 }, { "clip_ratio": 0.0, "completion_length": 794.078125, "epoch": 0.6236, "grad_norm": 0.1875343662912424, "kl": 0.133544921875, "learning_rate": 7.4590443074118325e-06, "loss": 0.0234, "reward": 2.052734375, "reward_std": 0.13435593992471695, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 1559 }, { "clip_ratio": 0.0, "completion_length": 736.7890625, "epoch": 0.624, "grad_norm": 0.22347082876669835, "kl": 0.1455078125, "learning_rate": 7.445542420642097e-06, "loss": 0.0239, "reward": 2.03515625, "reward_std": 0.26210860908031464, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1560 }, { "clip_ratio": 0.0, "completion_length": 735.8515625, "epoch": 0.6244, "grad_norm": 0.2680545386403358, "kl": 0.160888671875, "learning_rate": 7.432045513918122e-06, "loss": 0.0225, "reward": 2.189453125, "reward_std": 0.20752985030412674, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 1561 }, { "clip_ratio": 0.0, "completion_length": 708.2421875, "epoch": 0.6248, "grad_norm": 0.16310346241133236, "kl": 0.154541015625, "learning_rate": 7.418553613552824e-06, "loss": 0.0149, "reward": 2.201171875, "reward_std": 0.1320272907614708, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 1562 }, { "clip_ratio": 0.0, "completion_length": 641.3515625, "epoch": 0.6252, "grad_norm": 0.14249873180851547, "kl": 0.145751953125, "learning_rate": 7.405066745849347e-06, "loss": 0.0265, "reward": 2.0078125, "reward_std": 0.13224754482507706, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1563 }, { "clip_ratio": 0.0, "completion_length": 739.625, "epoch": 0.6256, "grad_norm": 0.12222262303090299, "kl": 0.162353515625, "learning_rate": 7.391584937101034e-06, "loss": 0.0118, "reward": 1.986328125, "reward_std": 0.0546875, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1564 }, { "clip_ratio": 0.0, "completion_length": 744.4453125, "epoch": 0.626, "grad_norm": 0.1810323457791237, "kl": 0.162353515625, "learning_rate": 7.378108213591355e-06, "loss": 0.0166, "reward": 2.02734375, "reward_std": 0.14216843247413635, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98828125, "step": 1565 }, { "clip_ratio": 0.0, "completion_length": 677.3203125, "epoch": 0.6264, "grad_norm": 0.21155708771944629, "kl": 0.153564453125, "learning_rate": 7.364636601593875e-06, "loss": 0.0381, "reward": 2.04296875, "reward_std": 0.1960124894976616, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1566 }, { "clip_ratio": 0.0, "completion_length": 737.7578125, "epoch": 0.6268, "grad_norm": 0.2622954892311983, "kl": 0.1572265625, "learning_rate": 7.351170127372191e-06, "loss": 0.0471, "reward": 1.97265625, "reward_std": 0.1886470764875412, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1567 }, { "clip_ratio": 0.0, "completion_length": 668.25, "epoch": 0.6272, "grad_norm": 0.774228952196342, "kl": 0.165771484375, "learning_rate": 7.33770881717989e-06, "loss": 0.0313, "reward": 2.232421875, "reward_std": 0.13857005536556244, "rewards/accuracy_reward": 0.2734375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1568 }, { "clip_ratio": 0.0, "completion_length": 680.1796875, "epoch": 0.6276, "grad_norm": 0.24998199090868822, "kl": 0.1617431640625, "learning_rate": 7.324252697260475e-06, "loss": 0.0542, "reward": 2.302734375, "reward_std": 0.30299198627471924, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1569 }, { "clip_ratio": 0.0, "completion_length": 723.3359375, "epoch": 0.628, "grad_norm": 1.8853014034967353, "kl": 0.17041015625, "learning_rate": 7.310801793847344e-06, "loss": 0.0464, "reward": 2.216796875, "reward_std": 0.2765265107154846, "rewards/accuracy_reward": 0.2734375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.982421875, "step": 1570 }, { "clip_ratio": 0.0, "completion_length": 729.015625, "epoch": 0.6284, "grad_norm": 0.18384211067488582, "kl": 0.17724609375, "learning_rate": 7.297356133163722e-06, "loss": 0.0293, "reward": 2.00390625, "reward_std": 0.16527669876813889, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1571 }, { "clip_ratio": 0.0, "completion_length": 733.8671875, "epoch": 0.6288, "grad_norm": 0.18748426151575345, "kl": 0.150634765625, "learning_rate": 7.283915741422611e-06, "loss": 0.0129, "reward": 2.0625, "reward_std": 0.17737750709056854, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1572 }, { "clip_ratio": 0.0, "completion_length": 716.8203125, "epoch": 0.6292, "grad_norm": 0.23404430066599036, "kl": 0.166259765625, "learning_rate": 7.27048064482675e-06, "loss": 0.0346, "reward": 2.11328125, "reward_std": 0.2535141110420227, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 1573 }, { "clip_ratio": 0.0, "completion_length": 728.6953125, "epoch": 0.6296, "grad_norm": 0.2600224585886482, "kl": 0.164794921875, "learning_rate": 7.257050869568536e-06, "loss": 0.0311, "reward": 2.009765625, "reward_std": 0.2109375, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 1574 }, { "clip_ratio": 0.0, "completion_length": 739.0078125, "epoch": 0.63, "grad_norm": 0.47204998965215583, "kl": 0.156005859375, "learning_rate": 7.243626441830009e-06, "loss": 0.0369, "reward": 2.08984375, "reward_std": 0.2847076281905174, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1575 }, { "clip_ratio": 0.0, "completion_length": 801.4609375, "epoch": 0.6304, "grad_norm": 0.25473145278157405, "kl": 0.14892578125, "learning_rate": 7.2302073877827775e-06, "loss": 0.0087, "reward": 2.177734375, "reward_std": 0.3895202800631523, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.982421875, "step": 1576 }, { "clip_ratio": 0.0, "completion_length": 803.5390625, "epoch": 0.6308, "grad_norm": 1.1604687300323875, "kl": 0.138427734375, "learning_rate": 7.216793733587976e-06, "loss": 0.0531, "reward": 1.93359375, "reward_std": 0.31137026846408844, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1577 }, { "clip_ratio": 0.0, "completion_length": 707.6640625, "epoch": 0.6312, "grad_norm": 0.24966488246675342, "kl": 0.1558837890625, "learning_rate": 7.203385505396203e-06, "loss": 0.0562, "reward": 2.115234375, "reward_std": 0.22436301410198212, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.966796875, "step": 1578 }, { "clip_ratio": 0.0, "completion_length": 738.375, "epoch": 0.6316, "grad_norm": 0.46817191918840473, "kl": 0.162353515625, "learning_rate": 7.189982729347491e-06, "loss": 0.07, "reward": 1.921875, "reward_std": 0.27784235030412674, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1579 }, { "clip_ratio": 0.0, "completion_length": 687.078125, "epoch": 0.632, "grad_norm": 0.29068404568813766, "kl": 0.15576171875, "learning_rate": 7.176585431571235e-06, "loss": 0.0695, "reward": 2.04296875, "reward_std": 0.3595879077911377, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1580 }, { "clip_ratio": 0.0, "completion_length": 798.734375, "epoch": 0.6324, "grad_norm": 0.30118009794829537, "kl": 0.1405029296875, "learning_rate": 7.163193638186159e-06, "loss": 0.0773, "reward": 2.0078125, "reward_std": 0.45377640426158905, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.9609375, "step": 1581 }, { "clip_ratio": 0.0, "completion_length": 774.265625, "epoch": 0.6328, "grad_norm": 0.2367023681321455, "kl": 0.169189453125, "learning_rate": 7.149807375300239e-06, "loss": 0.0498, "reward": 2.14453125, "reward_std": 0.3827263042330742, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.96484375, "step": 1582 }, { "clip_ratio": 0.0, "completion_length": 788.2109375, "epoch": 0.6332, "grad_norm": 0.158446486557738, "kl": 0.16015625, "learning_rate": 7.13642666901069e-06, "loss": 0.0261, "reward": 1.94921875, "reward_std": 0.2197130173444748, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 1583 }, { "clip_ratio": 0.0, "completion_length": 797.375, "epoch": 0.6336, "grad_norm": 0.32016331958469285, "kl": 0.179443359375, "learning_rate": 7.123051545403874e-06, "loss": 0.0224, "reward": 2.16796875, "reward_std": 0.30755671858787537, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1584 }, { "clip_ratio": 0.0, "completion_length": 789.4765625, "epoch": 0.634, "grad_norm": 1.3169993799512192, "kl": 0.167236328125, "learning_rate": 7.109682030555283e-06, "loss": 0.0315, "reward": 2.16796875, "reward_std": 0.3386722207069397, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1585 }, { "clip_ratio": 0.0, "completion_length": 742.546875, "epoch": 0.6344, "grad_norm": 0.2758058337836864, "kl": 0.1463623046875, "learning_rate": 7.096318150529476e-06, "loss": 0.0355, "reward": 2.20703125, "reward_std": 0.3128039762377739, "rewards/accuracy_reward": 0.2734375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.98046875, "step": 1586 }, { "clip_ratio": 0.0, "completion_length": 742.2265625, "epoch": 0.6348, "grad_norm": 0.26082804484287603, "kl": 0.182373046875, "learning_rate": 7.082959931380011e-06, "loss": 0.0623, "reward": 2.05078125, "reward_std": 0.26221734285354614, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 1587 }, { "clip_ratio": 0.0, "completion_length": 774.171875, "epoch": 0.6352, "grad_norm": 0.1649409254690828, "kl": 0.13623046875, "learning_rate": 7.069607399149427e-06, "loss": 0.0444, "reward": 2.06640625, "reward_std": 0.23735368251800537, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.97265625, "step": 1588 }, { "clip_ratio": 0.0, "completion_length": 793.2890625, "epoch": 0.6356, "grad_norm": 0.22283608000838392, "kl": 0.15771484375, "learning_rate": 7.056260579869165e-06, "loss": 0.0274, "reward": 1.998046875, "reward_std": 0.18924404680728912, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1589 }, { "clip_ratio": 0.0, "completion_length": 813.984375, "epoch": 0.636, "grad_norm": 0.22675679481811148, "kl": 0.1396484375, "learning_rate": 7.042919499559538e-06, "loss": 0.0157, "reward": 1.9921875, "reward_std": 0.2675572782754898, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.984375, "step": 1590 }, { "clip_ratio": 0.0, "completion_length": 723.9921875, "epoch": 0.6364, "grad_norm": 0.289187154320887, "kl": 0.149169921875, "learning_rate": 7.029584184229653e-06, "loss": 0.0141, "reward": 2.142578125, "reward_std": 0.2613266110420227, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1591 }, { "clip_ratio": 0.0, "completion_length": 724.3203125, "epoch": 0.6368, "grad_norm": 0.27144231524048734, "kl": 0.1630859375, "learning_rate": 7.016254659877398e-06, "loss": 0.0399, "reward": 1.982421875, "reward_std": 0.30479395389556885, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.974609375, "step": 1592 }, { "clip_ratio": 0.0, "completion_length": 724.4609375, "epoch": 0.6372, "grad_norm": 0.227095535552844, "kl": 0.14990234375, "learning_rate": 7.002930952489362e-06, "loss": 0.0424, "reward": 2.189453125, "reward_std": 0.22227438539266586, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 1593 }, { "clip_ratio": 0.0, "completion_length": 751.4921875, "epoch": 0.6376, "grad_norm": 0.21352562811947773, "kl": 0.141845703125, "learning_rate": 6.9896130880407965e-06, "loss": 0.0242, "reward": 2.16015625, "reward_std": 0.21457062661647797, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1594 }, { "clip_ratio": 0.0, "completion_length": 637.8046875, "epoch": 0.638, "grad_norm": 0.40482199862072304, "kl": 0.169677734375, "learning_rate": 6.976301092495556e-06, "loss": 0.107, "reward": 2.208984375, "reward_std": 0.3218516558408737, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1595 }, { "clip_ratio": 0.0, "completion_length": 782.1328125, "epoch": 0.6384, "grad_norm": 0.28902778235787063, "kl": 0.144287109375, "learning_rate": 6.962994991806059e-06, "loss": 0.0362, "reward": 1.9921875, "reward_std": 0.13508247584104538, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1596 }, { "clip_ratio": 0.0, "completion_length": 675.0390625, "epoch": 0.6388, "grad_norm": 8519.128316473792, "kl": 1.572021484375, "learning_rate": 6.949694811913226e-06, "loss": 0.1318, "reward": 2.033203125, "reward_std": 0.3158697113394737, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1597 }, { "clip_ratio": 0.0, "completion_length": 677.734375, "epoch": 0.6392, "grad_norm": 0.28290094837751834, "kl": 0.15625, "learning_rate": 6.9364005787464406e-06, "loss": 0.0404, "reward": 2.107421875, "reward_std": 0.287841372191906, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1598 }, { "clip_ratio": 0.0, "completion_length": 772.21875, "epoch": 0.6396, "grad_norm": 0.2273492010495088, "kl": 0.14990234375, "learning_rate": 6.923112318223497e-06, "loss": 0.0388, "reward": 2.1796875, "reward_std": 0.3313138484954834, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.9765625, "step": 1599 }, { "clip_ratio": 0.0, "completion_length": 811.9296875, "epoch": 0.64, "grad_norm": 1.0856918081937992, "kl": 0.22998046875, "learning_rate": 6.909830056250527e-06, "loss": 0.0442, "reward": 2.103515625, "reward_std": 0.35411009192466736, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.970703125, "step": 1600 }, { "clip_ratio": 0.0, "completion_length": 724.2734375, "epoch": 0.6404, "grad_norm": 0.2645159490176291, "kl": 0.177734375, "learning_rate": 6.896553818721989e-06, "loss": 0.0527, "reward": 1.98046875, "reward_std": 0.2646464630961418, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 1601 }, { "clip_ratio": 0.0, "completion_length": 748.390625, "epoch": 0.6408, "grad_norm": 0.2299814292424411, "kl": 0.1552734375, "learning_rate": 6.883283631520582e-06, "loss": 0.0547, "reward": 1.98828125, "reward_std": 0.23247353732585907, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 1602 }, { "clip_ratio": 0.0, "completion_length": 686.609375, "epoch": 0.6412, "grad_norm": 0.16470122230368095, "kl": 0.1522216796875, "learning_rate": 6.870019520517217e-06, "loss": 0.0247, "reward": 2.0859375, "reward_std": 0.15940608829259872, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1603 }, { "clip_ratio": 0.0, "completion_length": 667.8046875, "epoch": 0.6416, "grad_norm": 0.29188058502791686, "kl": 0.181396484375, "learning_rate": 6.856761511570963e-06, "loss": 0.0234, "reward": 1.98046875, "reward_std": 0.140625, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1604 }, { "clip_ratio": 0.0, "completion_length": 737.359375, "epoch": 0.642, "grad_norm": 0.29322433599251896, "kl": 0.160400390625, "learning_rate": 6.843509630528977e-06, "loss": 0.0401, "reward": 2.0625, "reward_std": 0.2253800332546234, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1605 }, { "clip_ratio": 0.0, "completion_length": 670.1953125, "epoch": 0.6424, "grad_norm": 0.24855087422305466, "kl": 0.1640625, "learning_rate": 6.830263903226483e-06, "loss": 0.0214, "reward": 2.15625, "reward_std": 0.17149211466312408, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1606 }, { "clip_ratio": 0.0, "completion_length": 779.3046875, "epoch": 0.6428, "grad_norm": 0.27329832145307575, "kl": 0.1343994140625, "learning_rate": 6.8170243554867065e-06, "loss": -0.0009, "reward": 2.0859375, "reward_std": 0.15092839300632477, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1607 }, { "clip_ratio": 0.0, "completion_length": 797.34375, "epoch": 0.6432, "grad_norm": 0.15230752547256593, "kl": 0.1356201171875, "learning_rate": 6.803791013120822e-06, "loss": 0.0255, "reward": 1.98046875, "reward_std": 0.140625, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1608 }, { "clip_ratio": 0.0, "completion_length": 776.90625, "epoch": 0.6436, "grad_norm": 0.3927311904437777, "kl": 0.168212890625, "learning_rate": 6.790563901927907e-06, "loss": 0.0348, "reward": 1.958984375, "reward_std": 0.1640625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1609 }, { "clip_ratio": 0.0, "completion_length": 698.234375, "epoch": 0.644, "grad_norm": 0.15831638827637298, "kl": 0.164794921875, "learning_rate": 6.777343047694891e-06, "loss": 0.0309, "reward": 1.994140625, "reward_std": 0.0859375, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1610 }, { "clip_ratio": 0.0, "completion_length": 726.8515625, "epoch": 0.6444, "grad_norm": 0.12228289461794273, "kl": 0.1533203125, "learning_rate": 6.764128476196505e-06, "loss": 0.0236, "reward": 1.9765625, "reward_std": 0.09375, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1611 }, { "clip_ratio": 0.0, "completion_length": 793.6953125, "epoch": 0.6448, "grad_norm": 0.27719157513263964, "kl": 0.13330078125, "learning_rate": 6.750920213195238e-06, "loss": 0.032, "reward": 2.029296875, "reward_std": 0.3176834285259247, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 1612 }, { "clip_ratio": 0.0, "completion_length": 718.9140625, "epoch": 0.6452, "grad_norm": 0.17319904060038452, "kl": 0.1353759765625, "learning_rate": 6.737718284441267e-06, "loss": 0.0165, "reward": 2.12890625, "reward_std": 0.13061904907226562, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1613 }, { "clip_ratio": 0.0, "completion_length": 775.6171875, "epoch": 0.6456, "grad_norm": 0.2094752638605604, "kl": 0.171142578125, "learning_rate": 6.7245227156724324e-06, "loss": 0.0397, "reward": 2.056640625, "reward_std": 0.20412220060825348, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1614 }, { "clip_ratio": 0.0, "completion_length": 761.203125, "epoch": 0.646, "grad_norm": 0.17536945355563804, "kl": 0.15478515625, "learning_rate": 6.711333532614168e-06, "loss": 0.0348, "reward": 2.14453125, "reward_std": 0.171875, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1615 }, { "clip_ratio": 0.0, "completion_length": 692.171875, "epoch": 0.6464, "grad_norm": 0.8167299453138958, "kl": 0.139892578125, "learning_rate": 6.698150760979463e-06, "loss": 0.0434, "reward": 2.064453125, "reward_std": 0.27273958921432495, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 1616 }, { "clip_ratio": 0.0, "completion_length": 708.9765625, "epoch": 0.6468, "grad_norm": 0.2604926273527964, "kl": 0.148681640625, "learning_rate": 6.684974426468809e-06, "loss": 0.0477, "reward": 2.025390625, "reward_std": 0.27404387295246124, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1617 }, { "clip_ratio": 0.0, "completion_length": 799.46875, "epoch": 0.6472, "grad_norm": 0.16012885652063472, "kl": 0.157958984375, "learning_rate": 6.671804554770135e-06, "loss": 0.0236, "reward": 1.998046875, "reward_std": 0.19718602299690247, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1618 }, { "clip_ratio": 0.0, "completion_length": 756.0078125, "epoch": 0.6476, "grad_norm": 0.2861952361678664, "kl": 0.159912109375, "learning_rate": 6.658641171558785e-06, "loss": 0.0291, "reward": 2.22265625, "reward_std": 0.24930480122566223, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1619 }, { "clip_ratio": 0.0, "completion_length": 637.3203125, "epoch": 0.648, "grad_norm": 0.27668279137012125, "kl": 0.16796875, "learning_rate": 6.645484302497452e-06, "loss": 0.0928, "reward": 2.09375, "reward_std": 0.31489915400743484, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1620 }, { "clip_ratio": 0.0, "completion_length": 706.2265625, "epoch": 0.6484, "grad_norm": 0.2552634588361414, "kl": 0.151611328125, "learning_rate": 6.63233397323612e-06, "loss": 0.0529, "reward": 2.001953125, "reward_std": 0.23558919876813889, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 1621 }, { "clip_ratio": 0.0, "completion_length": 698.5859375, "epoch": 0.6488, "grad_norm": 3.9863861074953286, "kl": 0.167236328125, "learning_rate": 6.6191902094120295e-06, "loss": 0.0436, "reward": 2.142578125, "reward_std": 0.27002984285354614, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 1622 }, { "clip_ratio": 0.0, "completion_length": 682.875, "epoch": 0.6492, "grad_norm": 0.216375562916526, "kl": 0.157470703125, "learning_rate": 6.60605303664962e-06, "loss": 0.0021, "reward": 2.09375, "reward_std": 0.120451420545578, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1623 }, { "clip_ratio": 0.0, "completion_length": 750.0859375, "epoch": 0.6496, "grad_norm": 0.27508313290350905, "kl": 0.1351318359375, "learning_rate": 6.5929224805604845e-06, "loss": 0.0316, "reward": 2.009765625, "reward_std": 0.29909127950668335, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 1624 }, { "clip_ratio": 0.0, "completion_length": 668.984375, "epoch": 0.65, "grad_norm": 0.48617799091033864, "kl": 0.1591796875, "learning_rate": 6.579798566743314e-06, "loss": 0.0362, "reward": 2.060546875, "reward_std": 0.2874041050672531, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1625 }, { "clip_ratio": 0.0, "completion_length": 748.6953125, "epoch": 0.6504, "grad_norm": 0.4513715972314291, "kl": 0.159423828125, "learning_rate": 6.566681320783849e-06, "loss": 0.0797, "reward": 1.904296875, "reward_std": 0.30994678288698196, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.958984375, "step": 1626 }, { "clip_ratio": 0.0, "completion_length": 761.515625, "epoch": 0.6508, "grad_norm": 0.1485287897831186, "kl": 0.1494140625, "learning_rate": 6.553570768254831e-06, "loss": 0.0208, "reward": 1.916015625, "reward_std": 0.1504981815814972, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.970703125, "step": 1627 }, { "clip_ratio": 0.0, "completion_length": 707.390625, "epoch": 0.6512, "grad_norm": 0.31383197810268415, "kl": 0.136962890625, "learning_rate": 6.540466934715953e-06, "loss": 0.019, "reward": 2.064453125, "reward_std": 0.3159921169281006, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 1628 }, { "clip_ratio": 0.0, "completion_length": 673.5390625, "epoch": 0.6516, "grad_norm": 0.22094447551115817, "kl": 0.150634765625, "learning_rate": 6.52736984571381e-06, "loss": 0.0651, "reward": 1.943359375, "reward_std": 0.3119630664587021, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 1629 }, { "clip_ratio": 0.0, "completion_length": 794.9765625, "epoch": 0.652, "grad_norm": 0.3152378770333232, "kl": 0.154052734375, "learning_rate": 6.5142795267818505e-06, "loss": 0.0836, "reward": 1.916015625, "reward_std": 0.45599570870399475, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.947265625, "step": 1630 }, { "clip_ratio": 0.0, "completion_length": 751.640625, "epoch": 0.6524, "grad_norm": 0.47924076019796463, "kl": 0.17431640625, "learning_rate": 6.501196003440313e-06, "loss": 0.1154, "reward": 1.986328125, "reward_std": 0.514088548719883, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.931640625, "step": 1631 }, { "clip_ratio": 0.0, "completion_length": 741.2578125, "epoch": 0.6528, "grad_norm": 0.23571316883380983, "kl": 0.1510009765625, "learning_rate": 6.488119301196201e-06, "loss": 0.0451, "reward": 2.189453125, "reward_std": 0.24886231869459152, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1632 }, { "clip_ratio": 0.0, "completion_length": 690.2421875, "epoch": 0.6532, "grad_norm": 0.5855114083861782, "kl": 0.1572265625, "learning_rate": 6.475049445543215e-06, "loss": 0.1083, "reward": 2.107421875, "reward_std": 0.5089182332158089, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 1633 }, { "clip_ratio": 0.0, "completion_length": 703.609375, "epoch": 0.6536, "grad_norm": 2.758058935937091, "kl": 0.4384765625, "learning_rate": 6.461986461961706e-06, "loss": 0.113, "reward": 2.525390625, "reward_std": 0.45748884230852127, "rewards/accuracy_reward": 0.59375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1634 }, { "clip_ratio": 0.0, "completion_length": 827.9609375, "epoch": 0.654, "grad_norm": 0.7712932161321725, "kl": 0.16162109375, "learning_rate": 6.448930375918632e-06, "loss": 0.0533, "reward": 1.962890625, "reward_std": 0.37108267843723297, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.939453125, "step": 1635 }, { "clip_ratio": 0.0, "completion_length": 767.71875, "epoch": 0.6544, "grad_norm": 0.33654970129162665, "kl": 0.177978515625, "learning_rate": 6.435881212867494e-06, "loss": 0.0648, "reward": 2.05859375, "reward_std": 0.3076309338212013, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1636 }, { "clip_ratio": 0.0, "completion_length": 738.9296875, "epoch": 0.6548, "grad_norm": 0.39890749771536016, "kl": 0.172119140625, "learning_rate": 6.422838998248308e-06, "loss": 0.1087, "reward": 1.916015625, "reward_std": 0.510308749973774, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.939453125, "step": 1637 }, { "clip_ratio": 0.0, "completion_length": 680.40625, "epoch": 0.6552, "grad_norm": 0.8121569383225392, "kl": 0.15673828125, "learning_rate": 6.409803757487539e-06, "loss": 0.0685, "reward": 2.041015625, "reward_std": 0.29464786499738693, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1638 }, { "clip_ratio": 0.0, "completion_length": 707.265625, "epoch": 0.6556, "grad_norm": 0.5252190656787219, "kl": 0.164794921875, "learning_rate": 6.396775515998055e-06, "loss": 0.127, "reward": 1.91015625, "reward_std": 0.5127677023410797, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.93359375, "step": 1639 }, { "clip_ratio": 0.0, "completion_length": 677.7890625, "epoch": 0.656, "grad_norm": 0.41879098995887915, "kl": 0.162353515625, "learning_rate": 6.383754299179079e-06, "loss": 0.0804, "reward": 2.0390625, "reward_std": 0.33230943232774734, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.953125, "step": 1640 }, { "clip_ratio": 0.0, "completion_length": 705.875, "epoch": 0.6564, "grad_norm": 0.29181458724214054, "kl": 0.14892578125, "learning_rate": 6.370740132416138e-06, "loss": 0.0304, "reward": 1.978515625, "reward_std": 0.2774069756269455, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.978515625, "step": 1641 }, { "clip_ratio": 0.0, "completion_length": 619.9453125, "epoch": 0.6568, "grad_norm": 0.4740049587418445, "kl": 0.18115234375, "learning_rate": 6.357733041081018e-06, "loss": 0.0744, "reward": 2.05859375, "reward_std": 0.30150531977415085, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1642 }, { "clip_ratio": 0.0, "completion_length": 745.7578125, "epoch": 0.6572, "grad_norm": 0.1967283977256689, "kl": 0.15234375, "learning_rate": 6.344733050531713e-06, "loss": 0.0401, "reward": 1.998046875, "reward_std": 0.23341139405965805, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 1643 }, { "clip_ratio": 0.0, "completion_length": 704.3046875, "epoch": 0.6576, "grad_norm": 0.3066782235111801, "kl": 0.1400146484375, "learning_rate": 6.33174018611236e-06, "loss": 0.027, "reward": 2.017578125, "reward_std": 0.15988312661647797, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1644 }, { "clip_ratio": 0.0, "completion_length": 696.5, "epoch": 0.658, "grad_norm": 0.4298295420150143, "kl": 0.150390625, "learning_rate": 6.318754473153221e-06, "loss": 0.0334, "reward": 2.119140625, "reward_std": 0.0859375, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1645 }, { "clip_ratio": 0.0, "completion_length": 648.0, "epoch": 0.6584, "grad_norm": 0.5498064835117461, "kl": 0.1357421875, "learning_rate": 6.305775936970606e-06, "loss": 0.092, "reward": 2.1953125, "reward_std": 0.3777061179280281, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1646 }, { "clip_ratio": 0.0, "completion_length": 634.2421875, "epoch": 0.6588, "grad_norm": 0.3331870331740382, "kl": 0.13916015625, "learning_rate": 6.292804602866833e-06, "loss": 0.0546, "reward": 2.1796875, "reward_std": 0.34852484613657, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1647 }, { "clip_ratio": 0.0, "completion_length": 729.625, "epoch": 0.6592, "grad_norm": 1.1357309174448604, "kl": 0.18798828125, "learning_rate": 6.27984049613019e-06, "loss": 0.1211, "reward": 2.013671875, "reward_std": 0.5124112516641617, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.943359375, "step": 1648 }, { "clip_ratio": 0.0, "completion_length": 654.9453125, "epoch": 0.6596, "grad_norm": 0.4284927672317878, "kl": 0.1605224609375, "learning_rate": 6.2668836420348535e-06, "loss": 0.0458, "reward": 2.08984375, "reward_std": 0.17289994657039642, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1649 }, { "clip_ratio": 0.0, "completion_length": 761.203125, "epoch": 0.66, "grad_norm": 0.3235771220305948, "kl": 0.1875, "learning_rate": 6.25393406584088e-06, "loss": 0.082, "reward": 1.953125, "reward_std": 0.47779224812984467, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9296875, "step": 1650 }, { "clip_ratio": 0.0, "completion_length": 734.9765625, "epoch": 0.6604, "grad_norm": 0.44915069912261263, "kl": 0.154296875, "learning_rate": 6.240991792794133e-06, "loss": 0.0975, "reward": 1.884765625, "reward_std": 0.3458234593272209, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.947265625, "step": 1651 }, { "clip_ratio": 0.0, "completion_length": 730.203125, "epoch": 0.6608, "grad_norm": 0.5058745396439983, "kl": 0.171630859375, "learning_rate": 6.228056848126236e-06, "loss": 0.0591, "reward": 1.982421875, "reward_std": 0.28103693574666977, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 1652 }, { "clip_ratio": 0.0, "completion_length": 663.0078125, "epoch": 0.6612, "grad_norm": 0.36218786414083376, "kl": 0.15185546875, "learning_rate": 6.2151292570545215e-06, "loss": 0.0908, "reward": 1.98046875, "reward_std": 0.4058326408267021, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1653 }, { "clip_ratio": 0.0, "completion_length": 672.6875, "epoch": 0.6616, "grad_norm": 1.70292867546149, "kl": 0.197265625, "learning_rate": 6.202209044781991e-06, "loss": 0.225, "reward": 2.080078125, "reward_std": 0.6564491242170334, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.923828125, "step": 1654 }, { "clip_ratio": 0.0, "completion_length": 690.546875, "epoch": 0.662, "grad_norm": 0.5725331626364715, "kl": 0.1552734375, "learning_rate": 6.18929623649726e-06, "loss": 0.0645, "reward": 1.962890625, "reward_std": 0.3326728940010071, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1655 }, { "clip_ratio": 0.0, "completion_length": 588.3515625, "epoch": 0.6624, "grad_norm": 0.4549399380676271, "kl": 0.174072265625, "learning_rate": 6.176390857374508e-06, "loss": 0.1052, "reward": 1.974609375, "reward_std": 0.539474830031395, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.935546875, "step": 1656 }, { "clip_ratio": 0.0, "completion_length": 658.234375, "epoch": 0.6628, "grad_norm": 0.281068729920374, "kl": 0.145751953125, "learning_rate": 6.1634929325734385e-06, "loss": 0.1061, "reward": 2.09765625, "reward_std": 0.4054764434695244, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 1657 }, { "clip_ratio": 0.0, "completion_length": 634.7265625, "epoch": 0.6632, "grad_norm": 0.3744030622881629, "kl": 0.16455078125, "learning_rate": 6.150602487239207e-06, "loss": 0.1477, "reward": 2.0234375, "reward_std": 0.35443951934576035, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1658 }, { "clip_ratio": 0.0, "completion_length": 699.3125, "epoch": 0.6636, "grad_norm": 0.6748581168084747, "kl": 0.2001953125, "learning_rate": 6.137719546502401e-06, "loss": 0.1401, "reward": 2.044921875, "reward_std": 0.3215351775288582, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 1659 }, { "clip_ratio": 0.0, "completion_length": 681.6953125, "epoch": 0.664, "grad_norm": 0.398487991768915, "kl": 0.15234375, "learning_rate": 6.124844135478971e-06, "loss": 0.1087, "reward": 2.01171875, "reward_std": 0.39362695068120956, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1660 }, { "clip_ratio": 0.0, "completion_length": 704.53125, "epoch": 0.6644, "grad_norm": 0.40665513923822033, "kl": 0.23486328125, "learning_rate": 6.1119762792701935e-06, "loss": 0.1388, "reward": 2.037109375, "reward_std": 0.5492773726582527, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.927734375, "step": 1661 }, { "clip_ratio": 0.0, "completion_length": 736.046875, "epoch": 0.6648, "grad_norm": 0.23821534395468522, "kl": 0.16845703125, "learning_rate": 6.099116002962604e-06, "loss": 0.0309, "reward": 1.96484375, "reward_std": 0.3292526826262474, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1662 }, { "clip_ratio": 0.0, "completion_length": 686.53125, "epoch": 0.6652, "grad_norm": 0.3424106515656655, "kl": 0.155517578125, "learning_rate": 6.086263331627976e-06, "loss": 0.1526, "reward": 1.90234375, "reward_std": 0.5344818159937859, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.91796875, "step": 1663 }, { "clip_ratio": 0.0, "completion_length": 634.4453125, "epoch": 0.6656, "grad_norm": 0.4185625198404277, "kl": 0.17724609375, "learning_rate": 6.073418290323251e-06, "loss": 0.1545, "reward": 2.048828125, "reward_std": 0.4556581377983093, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.955078125, "step": 1664 }, { "clip_ratio": 0.0, "completion_length": 716.375, "epoch": 0.666, "grad_norm": 0.2402659194612467, "kl": 0.14501953125, "learning_rate": 6.06058090409049e-06, "loss": 0.0515, "reward": 2.16015625, "reward_std": 0.3164580911397934, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1665 }, { "clip_ratio": 0.0, "completion_length": 731.4296875, "epoch": 0.6664, "grad_norm": 0.2748708749727525, "kl": 0.161865234375, "learning_rate": 6.047751197956838e-06, "loss": 0.0747, "reward": 2.201171875, "reward_std": 0.28035277873277664, "rewards/accuracy_reward": 0.2890625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 1666 }, { "clip_ratio": 0.0, "completion_length": 709.7109375, "epoch": 0.6668, "grad_norm": 0.4566611989090023, "kl": 0.166748046875, "learning_rate": 6.0349291969344595e-06, "loss": 0.125, "reward": 1.931640625, "reward_std": 0.46964580565690994, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.939453125, "step": 1667 }, { "clip_ratio": 0.0, "completion_length": 716.875, "epoch": 0.6672, "grad_norm": 0.37679821048889334, "kl": 0.15771484375, "learning_rate": 6.022114926020504e-06, "loss": 0.0645, "reward": 1.962890625, "reward_std": 0.41081366688013077, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.962890625, "step": 1668 }, { "clip_ratio": 0.0, "completion_length": 634.796875, "epoch": 0.6676, "grad_norm": 0.40745039909487313, "kl": 0.1572265625, "learning_rate": 6.009308410197048e-06, "loss": 0.0832, "reward": 2.05859375, "reward_std": 0.30014216899871826, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1669 }, { "clip_ratio": 0.0, "completion_length": 781.078125, "epoch": 0.668, "grad_norm": 0.38126840806283985, "kl": 0.1312255859375, "learning_rate": 5.996509674431053e-06, "loss": 0.0528, "reward": 2.048828125, "reward_std": 0.37042129039764404, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1670 }, { "clip_ratio": 0.0, "completion_length": 633.4921875, "epoch": 0.6684, "grad_norm": 0.29364136822278114, "kl": 0.144287109375, "learning_rate": 5.983718743674302e-06, "loss": 0.1372, "reward": 2.01171875, "reward_std": 0.41253168880939484, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1671 }, { "clip_ratio": 0.0, "completion_length": 633.6328125, "epoch": 0.6688, "grad_norm": 0.23222886842253823, "kl": 0.140380859375, "learning_rate": 5.970935642863375e-06, "loss": 0.0351, "reward": 2.0, "reward_std": 0.1441391110420227, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1672 }, { "clip_ratio": 0.0, "completion_length": 718.0859375, "epoch": 0.6692, "grad_norm": 0.4206340639583534, "kl": 0.150390625, "learning_rate": 5.958160396919577e-06, "loss": 0.0461, "reward": 2.01171875, "reward_std": 0.3313146382570267, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 1673 }, { "clip_ratio": 0.0, "completion_length": 716.6953125, "epoch": 0.6696, "grad_norm": 0.27276238124280666, "kl": 0.149169921875, "learning_rate": 5.94539303074891e-06, "loss": 0.0408, "reward": 2.03125, "reward_std": 0.3510126546025276, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1674 }, { "clip_ratio": 0.0, "completion_length": 671.28125, "epoch": 0.67, "grad_norm": 0.47164386169344674, "kl": 0.159423828125, "learning_rate": 5.932633569242e-06, "loss": 0.057, "reward": 1.99609375, "reward_std": 0.3913085162639618, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 1675 }, { "clip_ratio": 0.0, "completion_length": 696.0625, "epoch": 0.6704, "grad_norm": 0.1988861229564426, "kl": 0.1318359375, "learning_rate": 5.9198820372740726e-06, "loss": 0.0343, "reward": 1.99609375, "reward_std": 0.1597641110420227, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1676 }, { "clip_ratio": 0.0, "completion_length": 649.8359375, "epoch": 0.6708, "grad_norm": 0.6633969312934488, "kl": 0.156982421875, "learning_rate": 5.907138459704895e-06, "loss": 0.1192, "reward": 2.11328125, "reward_std": 0.41158032417297363, "rewards/accuracy_reward": 0.2421875, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94921875, "step": 1677 }, { "clip_ratio": 0.0, "completion_length": 756.0546875, "epoch": 0.6712, "grad_norm": 0.26356051362753974, "kl": 0.1280517578125, "learning_rate": 5.894402861378721e-06, "loss": 0.0612, "reward": 1.916015625, "reward_std": 0.2962058112025261, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 1678 }, { "clip_ratio": 0.0, "completion_length": 670.7109375, "epoch": 0.6716, "grad_norm": 1.044120123311122, "kl": 0.1650390625, "learning_rate": 5.881675267124254e-06, "loss": 0.0465, "reward": 2.17578125, "reward_std": 0.25381384789943695, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1679 }, { "clip_ratio": 0.0, "completion_length": 730.0546875, "epoch": 0.672, "grad_norm": 0.258367442505849, "kl": 0.14599609375, "learning_rate": 5.868955701754584e-06, "loss": 0.0894, "reward": 1.892578125, "reward_std": 0.35451073944568634, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.947265625, "step": 1680 }, { "clip_ratio": 0.0, "completion_length": 678.0234375, "epoch": 0.6724, "grad_norm": 0.6691046078617712, "kl": 0.216796875, "learning_rate": 5.85624419006716e-06, "loss": 0.1364, "reward": 1.84765625, "reward_std": 0.43515094369649887, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.93359375, "step": 1681 }, { "clip_ratio": 0.0, "completion_length": 704.265625, "epoch": 0.6728, "grad_norm": 0.39108125808715155, "kl": 0.16064453125, "learning_rate": 5.843540756843722e-06, "loss": 0.1098, "reward": 1.873046875, "reward_std": 0.3655601888895035, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.943359375, "step": 1682 }, { "clip_ratio": 0.0, "completion_length": 622.5703125, "epoch": 0.6732, "grad_norm": 0.47329046421673293, "kl": 0.2080078125, "learning_rate": 5.830845426850268e-06, "loss": 0.1253, "reward": 1.87109375, "reward_std": 0.411760613322258, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94140625, "step": 1683 }, { "clip_ratio": 0.0, "completion_length": 695.8515625, "epoch": 0.6736, "grad_norm": 0.44462249277612853, "kl": 0.17138671875, "learning_rate": 5.818158224836987e-06, "loss": 0.0958, "reward": 1.8984375, "reward_std": 0.4782891497015953, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.9375, "step": 1684 }, { "clip_ratio": 0.0, "completion_length": 665.171875, "epoch": 0.674, "grad_norm": 0.33762326443600615, "kl": 0.166259765625, "learning_rate": 5.8054791755382286e-06, "loss": 0.1858, "reward": 1.888671875, "reward_std": 0.584720715880394, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.912109375, "step": 1685 }, { "clip_ratio": 0.0, "completion_length": 659.671875, "epoch": 0.6744, "grad_norm": 0.2694027747344713, "kl": 0.16064453125, "learning_rate": 5.792808303672454e-06, "loss": 0.0647, "reward": 2.001953125, "reward_std": 0.20827669650316238, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 1686 }, { "clip_ratio": 0.0, "completion_length": 660.109375, "epoch": 0.6748, "grad_norm": 0.3624289814656988, "kl": 0.1553955078125, "learning_rate": 5.780145633942173e-06, "loss": 0.1537, "reward": 1.951171875, "reward_std": 0.46820078045129776, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.919921875, "step": 1687 }, { "clip_ratio": 0.0, "completion_length": 608.8125, "epoch": 0.6752, "grad_norm": 0.33635635742732745, "kl": 0.1414794921875, "learning_rate": 5.7674911910339094e-06, "loss": 0.0665, "reward": 2.296875, "reward_std": 0.35245805978775024, "rewards/accuracy_reward": 0.3515625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1688 }, { "clip_ratio": 0.0, "completion_length": 713.3359375, "epoch": 0.6756, "grad_norm": 0.28836373126958975, "kl": 0.162841796875, "learning_rate": 5.754844999618144e-06, "loss": 0.0753, "reward": 1.921875, "reward_std": 0.27784234285354614, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1689 }, { "clip_ratio": 0.0, "completion_length": 649.71875, "epoch": 0.676, "grad_norm": 0.2240454826165483, "kl": 0.12744140625, "learning_rate": 5.742207084349274e-06, "loss": 0.0693, "reward": 2.078125, "reward_std": 0.2370612919330597, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1690 }, { "clip_ratio": 0.0, "completion_length": 699.3125, "epoch": 0.6764, "grad_norm": 0.3941552127807068, "kl": 0.16357421875, "learning_rate": 5.729577469865566e-06, "loss": 0.0895, "reward": 2.013671875, "reward_std": 0.40029677748680115, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.943359375, "step": 1691 }, { "clip_ratio": 0.0, "completion_length": 591.0703125, "epoch": 0.6768, "grad_norm": 0.3066600279907723, "kl": 0.14794921875, "learning_rate": 5.716956180789098e-06, "loss": 0.071, "reward": 2.103515625, "reward_std": 0.29529647529125214, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 1692 }, { "clip_ratio": 0.0, "completion_length": 687.6015625, "epoch": 0.6772, "grad_norm": 0.49670533737544476, "kl": 0.1455078125, "learning_rate": 5.704343241725719e-06, "loss": 0.0276, "reward": 2.11328125, "reward_std": 0.15070747584104538, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1693 }, { "clip_ratio": 0.0, "completion_length": 682.796875, "epoch": 0.6776, "grad_norm": 0.31104728452117436, "kl": 0.147216796875, "learning_rate": 5.691738677265e-06, "loss": 0.134, "reward": 2.029296875, "reward_std": 0.4843241274356842, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.951171875, "step": 1694 }, { "clip_ratio": 0.0, "completion_length": 684.984375, "epoch": 0.678, "grad_norm": 0.8406113972700409, "kl": 0.142578125, "learning_rate": 5.679142511980176e-06, "loss": 0.0497, "reward": 1.986328125, "reward_std": 0.2931794673204422, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1695 }, { "clip_ratio": 0.0, "completion_length": 666.8984375, "epoch": 0.6784, "grad_norm": 0.29180179668165895, "kl": 0.142822265625, "learning_rate": 5.666554770428129e-06, "loss": 0.1069, "reward": 1.939453125, "reward_std": 0.3214595764875412, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.970703125, "step": 1696 }, { "clip_ratio": 0.0, "completion_length": 745.3671875, "epoch": 0.6788, "grad_norm": 0.2508109363019894, "kl": 0.145751953125, "learning_rate": 5.653975477149298e-06, "loss": 0.0536, "reward": 1.94921875, "reward_std": 0.2994270622730255, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.97265625, "step": 1697 }, { "clip_ratio": 0.0, "completion_length": 712.984375, "epoch": 0.6792, "grad_norm": 0.24387431362166703, "kl": 0.1494140625, "learning_rate": 5.641404656667661e-06, "loss": 0.0319, "reward": 2.162109375, "reward_std": 0.20572787523269653, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.982421875, "step": 1698 }, { "clip_ratio": 0.0, "completion_length": 663.46875, "epoch": 0.6796, "grad_norm": 0.30819543520439097, "kl": 0.140869140625, "learning_rate": 5.628842333490674e-06, "loss": 0.0723, "reward": 2.033203125, "reward_std": 0.3358974829316139, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 1699 }, { "clip_ratio": 0.0, "completion_length": 713.84375, "epoch": 0.68, "grad_norm": 0.5531459148980116, "kl": 0.1650390625, "learning_rate": 5.616288532109225e-06, "loss": 0.1234, "reward": 2.08203125, "reward_std": 0.3198796659708023, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.93359375, "step": 1700 }, { "clip_ratio": 0.0, "completion_length": 679.3984375, "epoch": 0.6804, "grad_norm": 0.611070754122924, "kl": 0.157470703125, "learning_rate": 5.603743276997607e-06, "loss": 0.0743, "reward": 1.962890625, "reward_std": 0.34727635979652405, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 1701 }, { "clip_ratio": 0.0, "completion_length": 732.125, "epoch": 0.6808, "grad_norm": 0.24283715106254358, "kl": 0.1348876953125, "learning_rate": 5.591206592613416e-06, "loss": 0.0264, "reward": 2.09375, "reward_std": 0.2582988888025284, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9921875, "step": 1702 }, { "clip_ratio": 0.0, "completion_length": 647.1171875, "epoch": 0.6812, "grad_norm": 0.6460265779299075, "kl": 0.127197265625, "learning_rate": 5.5786785033975745e-06, "loss": 0.0744, "reward": 1.984375, "reward_std": 0.24340169876813889, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 1703 }, { "clip_ratio": 0.0, "completion_length": 737.9296875, "epoch": 0.6816, "grad_norm": 0.39084599224142874, "kl": 0.154296875, "learning_rate": 5.5661590337742255e-06, "loss": 0.0949, "reward": 2.171875, "reward_std": 0.3543400391936302, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.96875, "step": 1704 }, { "clip_ratio": 0.0, "completion_length": 728.3828125, "epoch": 0.682, "grad_norm": 0.35208285001559014, "kl": 0.1669921875, "learning_rate": 5.553648208150728e-06, "loss": 0.1118, "reward": 1.857421875, "reward_std": 0.3995512053370476, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.943359375, "step": 1705 }, { "clip_ratio": 0.0, "completion_length": 719.421875, "epoch": 0.6824, "grad_norm": 0.3825580396813617, "kl": 0.167724609375, "learning_rate": 5.5411460509175605e-06, "loss": 0.1396, "reward": 2.103515625, "reward_std": 0.5003590360283852, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.939453125, "step": 1706 }, { "clip_ratio": 0.0, "completion_length": 696.4765625, "epoch": 0.6828, "grad_norm": 0.3033632851684795, "kl": 0.139404296875, "learning_rate": 5.5286525864483285e-06, "loss": 0.1427, "reward": 2.13671875, "reward_std": 0.49519604444503784, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.93359375, "step": 1707 }, { "clip_ratio": 0.0, "completion_length": 778.9453125, "epoch": 0.6832, "grad_norm": 0.41870506635889804, "kl": 0.1728515625, "learning_rate": 5.516167839099679e-06, "loss": 0.1279, "reward": 1.755859375, "reward_std": 0.6129872500896454, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.8359375, "rewards/tag_count_reward": 0.896484375, "step": 1708 }, { "clip_ratio": 0.0, "completion_length": 702.140625, "epoch": 0.6836, "grad_norm": 0.3599197167527492, "kl": 0.1640625, "learning_rate": 5.50369183321126e-06, "loss": 0.1077, "reward": 1.94140625, "reward_std": 0.4635615795850754, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94921875, "step": 1709 }, { "clip_ratio": 0.0, "completion_length": 675.5390625, "epoch": 0.684, "grad_norm": 1.4512436934239255, "kl": 0.25830078125, "learning_rate": 5.491224593105695e-06, "loss": 0.215, "reward": 2.0078125, "reward_std": 0.6188599020242691, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.8515625, "rewards/tag_count_reward": 0.8984375, "step": 1710 }, { "clip_ratio": 0.0, "completion_length": 672.8125, "epoch": 0.6844, "grad_norm": 0.5117887429050737, "kl": 0.167724609375, "learning_rate": 5.478766143088492e-06, "loss": 0.1241, "reward": 1.9140625, "reward_std": 0.4400438368320465, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.9453125, "step": 1711 }, { "clip_ratio": 0.0, "completion_length": 734.15625, "epoch": 0.6848, "grad_norm": 0.3815717996703037, "kl": 0.1455078125, "learning_rate": 5.466316507448049e-06, "loss": 0.0775, "reward": 1.962890625, "reward_std": 0.37045468389987946, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 1712 }, { "clip_ratio": 0.0, "completion_length": 668.203125, "epoch": 0.6852, "grad_norm": 0.4079519718197381, "kl": 0.1463623046875, "learning_rate": 5.453875710455562e-06, "loss": 0.1645, "reward": 2.1484375, "reward_std": 0.5081270858645439, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.9296875, "step": 1713 }, { "clip_ratio": 0.0, "completion_length": 679.9453125, "epoch": 0.6856, "grad_norm": 0.48927056203380653, "kl": 0.169921875, "learning_rate": 5.441443776365003e-06, "loss": 0.1262, "reward": 1.923828125, "reward_std": 0.5332425832748413, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.892578125, "step": 1714 }, { "clip_ratio": 0.0, "completion_length": 794.6875, "epoch": 0.686, "grad_norm": 0.41948980231176297, "kl": 0.178466796875, "learning_rate": 5.429020729413062e-06, "loss": 0.1205, "reward": 1.818359375, "reward_std": 0.6508505940437317, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.8046875, "rewards/tag_count_reward": 0.865234375, "step": 1715 }, { "clip_ratio": 0.0, "completion_length": 762.875, "epoch": 0.6864, "grad_norm": 0.38514263879123367, "kl": 0.174072265625, "learning_rate": 5.416606593819102e-06, "loss": 0.1066, "reward": 1.802734375, "reward_std": 0.48727037757635117, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.912109375, "step": 1716 }, { "clip_ratio": 0.0, "completion_length": 666.8359375, "epoch": 0.6868, "grad_norm": 0.3986982323867487, "kl": 0.14501953125, "learning_rate": 5.404201393785123e-06, "loss": 0.1505, "reward": 1.970703125, "reward_std": 0.5284249186515808, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.931640625, "step": 1717 }, { "clip_ratio": 0.0, "completion_length": 655.953125, "epoch": 0.6872, "grad_norm": 0.5142658390323906, "kl": 0.15283203125, "learning_rate": 5.391805153495693e-06, "loss": 0.1516, "reward": 2.126953125, "reward_std": 0.5226849988102913, "rewards/accuracy_reward": 0.3203125, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.923828125, "step": 1718 }, { "clip_ratio": 0.0, "completion_length": 776.5390625, "epoch": 0.6876, "grad_norm": 0.3477051386402458, "kl": 0.16796875, "learning_rate": 5.379417897117917e-06, "loss": 0.1259, "reward": 1.955078125, "reward_std": 0.6281413733959198, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.8515625, "rewards/tag_count_reward": 0.900390625, "step": 1719 }, { "clip_ratio": 0.0, "completion_length": 723.8046875, "epoch": 0.688, "grad_norm": 0.31366865266787247, "kl": 0.182861328125, "learning_rate": 5.367039648801386e-06, "loss": 0.1552, "reward": 1.9375, "reward_std": 0.5046926066279411, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9296875, "step": 1720 }, { "clip_ratio": 0.0, "completion_length": 757.8359375, "epoch": 0.6884, "grad_norm": 0.26691742989670675, "kl": 0.147705078125, "learning_rate": 5.354670432678124e-06, "loss": 0.115, "reward": 1.818359375, "reward_std": 0.5130340084433556, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.919921875, "step": 1721 }, { "clip_ratio": 0.0, "completion_length": 789.2421875, "epoch": 0.6888, "grad_norm": 0.2946816978876066, "kl": 0.16943359375, "learning_rate": 5.342310272862558e-06, "loss": 0.0715, "reward": 1.845703125, "reward_std": 0.516072116792202, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.931640625, "step": 1722 }, { "clip_ratio": 0.0, "completion_length": 709.421875, "epoch": 0.6892, "grad_norm": 0.42229323852197265, "kl": 0.162841796875, "learning_rate": 5.3299591934514485e-06, "loss": 0.1129, "reward": 1.83203125, "reward_std": 0.43567828088998795, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.93359375, "step": 1723 }, { "clip_ratio": 0.0, "completion_length": 696.71875, "epoch": 0.6896, "grad_norm": 2.2872345655213433, "kl": 0.323486328125, "learning_rate": 5.317617218523856e-06, "loss": 0.0652, "reward": 2.09765625, "reward_std": 0.2750801518559456, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1724 }, { "clip_ratio": 0.0, "completion_length": 726.3828125, "epoch": 0.69, "grad_norm": 0.505549064837362, "kl": 0.1982421875, "learning_rate": 5.305284372141095e-06, "loss": 0.103, "reward": 1.939453125, "reward_std": 0.4874589964747429, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.916015625, "step": 1725 }, { "clip_ratio": 0.0, "completion_length": 643.03125, "epoch": 0.6904, "grad_norm": 0.4376618440548234, "kl": 0.1552734375, "learning_rate": 5.292960678346674e-06, "loss": 0.1124, "reward": 1.916015625, "reward_std": 0.5134273916482925, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.916015625, "step": 1726 }, { "clip_ratio": 0.0, "completion_length": 676.2890625, "epoch": 0.6908, "grad_norm": 0.346470384025337, "kl": 0.15869140625, "learning_rate": 5.280646161166274e-06, "loss": 0.1367, "reward": 1.94140625, "reward_std": 0.3461521565914154, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.91796875, "step": 1727 }, { "clip_ratio": 0.0, "completion_length": 778.2578125, "epoch": 0.6912, "grad_norm": 0.5946328091817478, "kl": 0.1572265625, "learning_rate": 5.26834084460767e-06, "loss": 0.0978, "reward": 1.861328125, "reward_std": 0.5073713585734367, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.931640625, "step": 1728 }, { "clip_ratio": 0.0, "completion_length": 723.046875, "epoch": 0.6916, "grad_norm": 0.8734574380565441, "kl": 0.151123046875, "learning_rate": 5.256044752660709e-06, "loss": 0.0479, "reward": 2.32421875, "reward_std": 0.1744270622730255, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1729 }, { "clip_ratio": 0.0, "completion_length": 680.7578125, "epoch": 0.692, "grad_norm": 0.32211102117737883, "kl": 0.16748046875, "learning_rate": 5.243757909297247e-06, "loss": 0.0555, "reward": 2.0234375, "reward_std": 0.33842839300632477, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1730 }, { "clip_ratio": 0.0, "completion_length": 692.6875, "epoch": 0.6924, "grad_norm": 0.5396234591038553, "kl": 0.18603515625, "learning_rate": 5.23148033847112e-06, "loss": 0.115, "reward": 1.8984375, "reward_std": 0.5133310630917549, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9375, "step": 1731 }, { "clip_ratio": 0.0, "completion_length": 658.765625, "epoch": 0.6928, "grad_norm": 0.9199397567675403, "kl": 0.1572265625, "learning_rate": 5.219212064118079e-06, "loss": 0.1449, "reward": 1.875, "reward_std": 0.4328666031360626, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9375, "step": 1732 }, { "clip_ratio": 0.0, "completion_length": 776.7109375, "epoch": 0.6932, "grad_norm": 2.083695093383946, "kl": 0.888916015625, "learning_rate": 5.2069531101557505e-06, "loss": 0.1087, "reward": 1.966796875, "reward_std": 0.438191756606102, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.951171875, "step": 1733 }, { "clip_ratio": 0.0, "completion_length": 751.4765625, "epoch": 0.6936, "grad_norm": 0.3603817959643571, "kl": 0.143798828125, "learning_rate": 5.194703500483593e-06, "loss": 0.0952, "reward": 1.890625, "reward_std": 0.36096110939979553, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.953125, "step": 1734 }, { "clip_ratio": 0.0, "completion_length": 728.6171875, "epoch": 0.694, "grad_norm": 0.816285445595864, "kl": 0.1580810546875, "learning_rate": 5.1824632589828465e-06, "loss": 0.0746, "reward": 1.90234375, "reward_std": 0.42350734025239944, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94140625, "step": 1735 }, { "clip_ratio": 0.0, "completion_length": 795.6328125, "epoch": 0.6944, "grad_norm": 0.3844017698806344, "kl": 0.1689453125, "learning_rate": 5.1702324095164955e-06, "loss": 0.0623, "reward": 1.998046875, "reward_std": 0.3918011784553528, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.943359375, "step": 1736 }, { "clip_ratio": 0.0, "completion_length": 811.3671875, "epoch": 0.6948, "grad_norm": 0.3750944706173702, "kl": 0.188232421875, "learning_rate": 5.158010975929193e-06, "loss": 0.097, "reward": 1.78125, "reward_std": 0.48607219010591507, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.8984375, "step": 1737 }, { "clip_ratio": 0.0, "completion_length": 715.40625, "epoch": 0.6952, "grad_norm": 0.4501913619822988, "kl": 0.1534423828125, "learning_rate": 5.145798982047261e-06, "loss": 0.1093, "reward": 1.93359375, "reward_std": 0.46249718219041824, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.93359375, "step": 1738 }, { "clip_ratio": 0.0, "completion_length": 738.2578125, "epoch": 0.6956, "grad_norm": 0.3053623989042861, "kl": 0.157470703125, "learning_rate": 5.133596451678603e-06, "loss": 0.0607, "reward": 1.998046875, "reward_std": 0.3151264563202858, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.951171875, "step": 1739 }, { "clip_ratio": 0.0, "completion_length": 763.515625, "epoch": 0.696, "grad_norm": 0.5654176120744705, "kl": 0.17138671875, "learning_rate": 5.121403408612672e-06, "loss": 0.1068, "reward": 1.9140625, "reward_std": 0.4468134567141533, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.9296875, "step": 1740 }, { "clip_ratio": 0.0, "completion_length": 815.4453125, "epoch": 0.6964, "grad_norm": 0.700821388516419, "kl": 0.147705078125, "learning_rate": 5.109219876620441e-06, "loss": 0.1108, "reward": 1.9453125, "reward_std": 0.5002385377883911, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.9296875, "step": 1741 }, { "clip_ratio": 0.0, "completion_length": 739.109375, "epoch": 0.6968, "grad_norm": 0.3476313201344831, "kl": 0.166748046875, "learning_rate": 5.0970458794543135e-06, "loss": 0.0953, "reward": 1.8984375, "reward_std": 0.4375499486923218, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9375, "step": 1742 }, { "clip_ratio": 0.0, "completion_length": 717.8671875, "epoch": 0.6972, "grad_norm": 0.4724600042583214, "kl": 0.13671875, "learning_rate": 5.0848814408481305e-06, "loss": 0.1299, "reward": 1.923828125, "reward_std": 0.5030049309134483, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.939453125, "step": 1743 }, { "clip_ratio": 0.0, "completion_length": 705.0, "epoch": 0.6976, "grad_norm": 0.33223018595429515, "kl": 0.158935546875, "learning_rate": 5.072726584517086e-06, "loss": 0.1281, "reward": 1.845703125, "reward_std": 0.4593522921204567, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.923828125, "step": 1744 }, { "clip_ratio": 0.0, "completion_length": 691.65625, "epoch": 0.698, "grad_norm": 0.6257356113753417, "kl": 0.171142578125, "learning_rate": 5.060581334157693e-06, "loss": 0.1717, "reward": 1.869140625, "reward_std": 0.54739560931921, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.916015625, "step": 1745 }, { "clip_ratio": 0.0, "completion_length": 584.2421875, "epoch": 0.6984, "grad_norm": 1.4458963110252685, "kl": 0.31982421875, "learning_rate": 5.048445713447738e-06, "loss": 0.1329, "reward": 1.98046875, "reward_std": 0.30899855494499207, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 1746 }, { "clip_ratio": 0.0, "completion_length": 602.5546875, "epoch": 0.6988, "grad_norm": 1.3469579041998, "kl": 0.240234375, "learning_rate": 5.036319746046232e-06, "loss": 0.0734, "reward": 2.025390625, "reward_std": 0.33248014748096466, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.955078125, "step": 1747 }, { "clip_ratio": 0.0, "completion_length": 717.3671875, "epoch": 0.6992, "grad_norm": 0.9149236517747326, "kl": 0.2236328125, "learning_rate": 5.024203455593375e-06, "loss": 0.141, "reward": 2.033203125, "reward_std": 0.528538890182972, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.923828125, "step": 1748 }, { "clip_ratio": 0.0, "completion_length": 705.7421875, "epoch": 0.6996, "grad_norm": 1.8220452442270945, "kl": 0.298828125, "learning_rate": 5.012096865710494e-06, "loss": 0.197, "reward": 1.91015625, "reward_std": 0.6190674006938934, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.90234375, "step": 1749 }, { "clip_ratio": 0.0, "completion_length": 731.1875, "epoch": 0.7, "grad_norm": 0.4017575159736881, "kl": 0.16259765625, "learning_rate": 5.000000000000003e-06, "loss": 0.0941, "reward": 2.029296875, "reward_std": 0.43697383999824524, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.951171875, "step": 1750 }, { "clip_ratio": 0.0, "completion_length": 709.1484375, "epoch": 0.7004, "grad_norm": 0.4750243803515266, "kl": 0.19873046875, "learning_rate": 4.98791288204536e-06, "loss": 0.1221, "reward": 1.826171875, "reward_std": 0.46067044883966446, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.904296875, "step": 1751 }, { "clip_ratio": 0.0, "completion_length": 676.34375, "epoch": 0.7008, "grad_norm": 0.4314588436919711, "kl": 0.1865234375, "learning_rate": 4.97583553541102e-06, "loss": 0.1153, "reward": 1.93359375, "reward_std": 0.42766745388507843, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.93359375, "step": 1752 }, { "clip_ratio": 0.0, "completion_length": 764.9765625, "epoch": 0.7012, "grad_norm": 0.4151843098914702, "kl": 0.15234375, "learning_rate": 4.9637679836423926e-06, "loss": 0.0984, "reward": 1.9375, "reward_std": 0.5642687976360321, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9296875, "step": 1753 }, { "clip_ratio": 0.0, "completion_length": 675.9609375, "epoch": 0.7016, "grad_norm": 1.087751501070932, "kl": 0.197998046875, "learning_rate": 4.951710250265785e-06, "loss": 0.1979, "reward": 1.775390625, "reward_std": 0.49454937130212784, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.900390625, "step": 1754 }, { "clip_ratio": 0.0, "completion_length": 702.2109375, "epoch": 0.702, "grad_norm": 0.30271674058452097, "kl": 0.17431640625, "learning_rate": 4.939662358788364e-06, "loss": 0.0788, "reward": 2.041015625, "reward_std": 0.46353983134031296, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.923828125, "step": 1755 }, { "clip_ratio": 0.0, "completion_length": 739.171875, "epoch": 0.7024, "grad_norm": 0.9460210684539612, "kl": 0.186279296875, "learning_rate": 4.927624332698109e-06, "loss": 0.1349, "reward": 1.876953125, "reward_std": 0.5689611434936523, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.900390625, "step": 1756 }, { "clip_ratio": 0.0, "completion_length": 711.8203125, "epoch": 0.7028, "grad_norm": 1.4466216187612082, "kl": 0.2265625, "learning_rate": 4.915596195463773e-06, "loss": 0.1863, "reward": 1.740234375, "reward_std": 0.6241319924592972, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.8203125, "rewards/tag_count_reward": 0.873046875, "step": 1757 }, { "clip_ratio": 0.0, "completion_length": 796.140625, "epoch": 0.7032, "grad_norm": 0.27420927881619284, "kl": 0.149658203125, "learning_rate": 4.903577970534823e-06, "loss": 0.0715, "reward": 1.8984375, "reward_std": 0.41693519055843353, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.9453125, "step": 1758 }, { "clip_ratio": 0.0, "completion_length": 629.6640625, "epoch": 0.7036, "grad_norm": 0.46707582744109094, "kl": 0.1767578125, "learning_rate": 4.891569681341403e-06, "loss": 0.088, "reward": 2.078125, "reward_std": 0.42931613326072693, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1759 }, { "clip_ratio": 0.0, "completion_length": 704.9609375, "epoch": 0.704, "grad_norm": 0.2949767515406735, "kl": 0.16259765625, "learning_rate": 4.879571351294287e-06, "loss": 0.1036, "reward": 1.9453125, "reward_std": 0.40670308470726013, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.953125, "step": 1760 }, { "clip_ratio": 0.0, "completion_length": 682.15625, "epoch": 0.7044, "grad_norm": 0.30908517425412985, "kl": 0.1552734375, "learning_rate": 4.8675830037848295e-06, "loss": 0.0936, "reward": 1.966796875, "reward_std": 0.3267252892255783, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 1761 }, { "clip_ratio": 0.0, "completion_length": 784.1015625, "epoch": 0.7048, "grad_norm": 0.2630917617335723, "kl": 0.1361083984375, "learning_rate": 4.855604662184935e-06, "loss": 0.034, "reward": 1.99609375, "reward_std": 0.2965452969074249, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.95703125, "step": 1762 }, { "clip_ratio": 0.0, "completion_length": 684.828125, "epoch": 0.7052, "grad_norm": 0.39302262966584883, "kl": 0.16064453125, "learning_rate": 4.843636349846991e-06, "loss": 0.1257, "reward": 2.056640625, "reward_std": 0.47440948337316513, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.947265625, "step": 1763 }, { "clip_ratio": 0.0, "completion_length": 749.3125, "epoch": 0.7056, "grad_norm": 0.404293154129061, "kl": 0.16162109375, "learning_rate": 4.831678090103832e-06, "loss": 0.113, "reward": 1.857421875, "reward_std": 0.4954274371266365, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.935546875, "step": 1764 }, { "clip_ratio": 0.0, "completion_length": 665.359375, "epoch": 0.706, "grad_norm": 2.276508975003625, "kl": 0.157958984375, "learning_rate": 4.8197299062687e-06, "loss": 0.1131, "reward": 2.0, "reward_std": 0.4029608368873596, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.953125, "step": 1765 }, { "clip_ratio": 0.0, "completion_length": 756.0078125, "epoch": 0.7064, "grad_norm": 0.5268295845055778, "kl": 0.218505859375, "learning_rate": 4.807791821635186e-06, "loss": 0.166, "reward": 1.82421875, "reward_std": 0.5646712481975555, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.828125, "rewards/tag_count_reward": 0.87109375, "step": 1766 }, { "clip_ratio": 0.0, "completion_length": 733.1640625, "epoch": 0.7068, "grad_norm": 0.6165190936580701, "kl": 0.13818359375, "learning_rate": 4.795863859477207e-06, "loss": 0.0445, "reward": 2.013671875, "reward_std": 0.20258088409900665, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1767 }, { "clip_ratio": 0.0, "completion_length": 712.1171875, "epoch": 0.7072, "grad_norm": 4.154466339758933, "kl": 0.281005859375, "learning_rate": 4.783946043048922e-06, "loss": 0.1497, "reward": 2.0078125, "reward_std": 0.3191300332546234, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.953125, "step": 1768 }, { "clip_ratio": 0.0, "completion_length": 743.859375, "epoch": 0.7076, "grad_norm": 0.25500726744649355, "kl": 0.15478515625, "learning_rate": 4.772038395584735e-06, "loss": 0.0738, "reward": 1.90234375, "reward_std": 0.31623566895723343, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 1769 }, { "clip_ratio": 0.0, "completion_length": 705.4765625, "epoch": 0.708, "grad_norm": 0.8458356476576152, "kl": 0.199462890625, "learning_rate": 4.76014094029921e-06, "loss": 0.2099, "reward": 1.908203125, "reward_std": 0.5829346925020218, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.900390625, "step": 1770 }, { "clip_ratio": 0.0, "completion_length": 790.296875, "epoch": 0.7084, "grad_norm": 0.3539718280143462, "kl": 0.17041015625, "learning_rate": 4.7482537003870425e-06, "loss": 0.0587, "reward": 1.9296875, "reward_std": 0.32361215353012085, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.9609375, "step": 1771 }, { "clip_ratio": 0.0, "completion_length": 707.5, "epoch": 0.7088, "grad_norm": 1.2860292620599025, "kl": 0.1474609375, "learning_rate": 4.736376699023023e-06, "loss": 0.1396, "reward": 2.005859375, "reward_std": 0.49077461659908295, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.943359375, "step": 1772 }, { "clip_ratio": 0.0, "completion_length": 689.1484375, "epoch": 0.7092, "grad_norm": 0.570659794087851, "kl": 0.1943359375, "learning_rate": 4.724509959361961e-06, "loss": 0.1398, "reward": 1.76171875, "reward_std": 0.4745500683784485, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.88671875, "step": 1773 }, { "clip_ratio": 0.0, "completion_length": 728.84375, "epoch": 0.7096, "grad_norm": 1.0348209882137493, "kl": 0.1572265625, "learning_rate": 4.712653504538684e-06, "loss": 0.1393, "reward": 1.90234375, "reward_std": 0.505212277173996, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.91015625, "step": 1774 }, { "clip_ratio": 0.0, "completion_length": 747.2578125, "epoch": 0.71, "grad_norm": 0.7779622842609188, "kl": 0.181640625, "learning_rate": 4.700807357667953e-06, "loss": 0.1867, "reward": 1.791015625, "reward_std": 0.7607593983411789, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.8046875, "rewards/tag_count_reward": 0.869140625, "step": 1775 }, { "clip_ratio": 0.0, "completion_length": 681.0, "epoch": 0.7104, "grad_norm": 0.5766967583779418, "kl": 0.1728515625, "learning_rate": 4.688971541844436e-06, "loss": 0.1092, "reward": 1.951171875, "reward_std": 0.44873448461294174, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.927734375, "step": 1776 }, { "clip_ratio": 0.0, "completion_length": 773.296875, "epoch": 0.7108, "grad_norm": 1.5610470764122648, "kl": 0.2314453125, "learning_rate": 4.677146080142664e-06, "loss": 0.1792, "reward": 1.66796875, "reward_std": 0.6101634949445724, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.78125, "rewards/tag_count_reward": 0.84765625, "step": 1777 }, { "clip_ratio": 0.0, "completion_length": 693.0078125, "epoch": 0.7112, "grad_norm": 0.9550113795930188, "kl": 0.184814453125, "learning_rate": 4.6653309956169745e-06, "loss": 0.2097, "reward": 1.875, "reward_std": 0.6310351341962814, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.8984375, "step": 1778 }, { "clip_ratio": 0.0, "completion_length": 661.671875, "epoch": 0.7116, "grad_norm": 0.753984300747995, "kl": 0.228271484375, "learning_rate": 4.6535263113014885e-06, "loss": 0.0997, "reward": 1.892578125, "reward_std": 0.49566682428121567, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.916015625, "step": 1779 }, { "clip_ratio": 0.0, "completion_length": 632.09375, "epoch": 0.712, "grad_norm": 0.6526260678793652, "kl": 0.162109375, "learning_rate": 4.641732050210032e-06, "loss": 0.126, "reward": 1.86328125, "reward_std": 0.4297672063112259, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.91015625, "step": 1780 }, { "clip_ratio": 0.0, "completion_length": 738.296875, "epoch": 0.7124, "grad_norm": 0.6210618605433521, "kl": 0.19287109375, "learning_rate": 4.629948235336133e-06, "loss": 0.0659, "reward": 1.947265625, "reward_std": 0.3956039324402809, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.939453125, "step": 1781 }, { "clip_ratio": 0.0, "completion_length": 698.1953125, "epoch": 0.7128, "grad_norm": 0.3954858638727855, "kl": 0.161376953125, "learning_rate": 4.618174889652928e-06, "loss": 0.1213, "reward": 2.091796875, "reward_std": 0.418451152741909, "rewards/accuracy_reward": 0.2890625, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.919921875, "step": 1782 }, { "clip_ratio": 0.0, "completion_length": 680.5234375, "epoch": 0.7132, "grad_norm": 0.5525921733869903, "kl": 0.177978515625, "learning_rate": 4.606412036113166e-06, "loss": 0.1294, "reward": 1.890625, "reward_std": 0.4920841231942177, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9296875, "step": 1783 }, { "clip_ratio": 0.0, "completion_length": 779.4296875, "epoch": 0.7136, "grad_norm": 20.88550032769505, "kl": 2.194580078125, "learning_rate": 4.59465969764913e-06, "loss": 0.1391, "reward": 1.943359375, "reward_std": 0.416276253759861, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.951171875, "step": 1784 }, { "clip_ratio": 0.0, "completion_length": 692.46875, "epoch": 0.714, "grad_norm": 0.37994680914786694, "kl": 0.169921875, "learning_rate": 4.582917897172603e-06, "loss": 0.1084, "reward": 1.8671875, "reward_std": 0.3353988751769066, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9375, "step": 1785 }, { "clip_ratio": 0.0, "completion_length": 684.640625, "epoch": 0.7144, "grad_norm": 0.5015479165416087, "kl": 0.1943359375, "learning_rate": 4.571186657574828e-06, "loss": 0.1964, "reward": 1.9765625, "reward_std": 0.5691923648118973, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.9140625, "step": 1786 }, { "clip_ratio": 0.0, "completion_length": 730.3046875, "epoch": 0.7148, "grad_norm": 0.23618490467238826, "kl": 0.149658203125, "learning_rate": 4.559466001726451e-06, "loss": 0.0209, "reward": 2.11328125, "reward_std": 0.2963922694325447, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1787 }, { "clip_ratio": 0.0, "completion_length": 671.609375, "epoch": 0.7152, "grad_norm": 0.6190478396627496, "kl": 0.177978515625, "learning_rate": 4.5477559524775e-06, "loss": 0.1129, "reward": 2.162109375, "reward_std": 0.37536413967609406, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 1788 }, { "clip_ratio": 0.0, "completion_length": 681.609375, "epoch": 0.7156, "grad_norm": 0.3340703424956092, "kl": 0.14990234375, "learning_rate": 4.53605653265731e-06, "loss": 0.0865, "reward": 1.84765625, "reward_std": 0.2234838604927063, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.93359375, "step": 1789 }, { "clip_ratio": 0.0, "completion_length": 747.4921875, "epoch": 0.716, "grad_norm": 0.19912750823959338, "kl": 0.143798828125, "learning_rate": 4.524367765074499e-06, "loss": 0.0756, "reward": 1.91015625, "reward_std": 0.2949570566415787, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 1790 }, { "clip_ratio": 0.0, "completion_length": 745.265625, "epoch": 0.7164, "grad_norm": 0.3429278820369065, "kl": 0.146484375, "learning_rate": 4.512689672516918e-06, "loss": 0.0774, "reward": 2.177734375, "reward_std": 0.5137604773044586, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.951171875, "step": 1791 }, { "clip_ratio": 0.0, "completion_length": 728.5703125, "epoch": 0.7168, "grad_norm": 0.30908362509072995, "kl": 0.17333984375, "learning_rate": 4.501022277751602e-06, "loss": 0.0916, "reward": 1.818359375, "reward_std": 0.3747178837656975, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.912109375, "step": 1792 }, { "clip_ratio": 0.0, "completion_length": 649.96875, "epoch": 0.7172, "grad_norm": 0.3899841123752739, "kl": 0.14697265625, "learning_rate": 4.48936560352474e-06, "loss": 0.1245, "reward": 2.16015625, "reward_std": 0.4128234386444092, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.96484375, "step": 1793 }, { "clip_ratio": 0.0, "completion_length": 663.875, "epoch": 0.7176, "grad_norm": 0.72336667605014, "kl": 0.158447265625, "learning_rate": 4.477719672561615e-06, "loss": 0.0685, "reward": 2.185546875, "reward_std": 0.23008864372968674, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1794 }, { "clip_ratio": 0.0, "completion_length": 770.71875, "epoch": 0.718, "grad_norm": 0.29417332236541305, "kl": 0.148193359375, "learning_rate": 4.46608450756656e-06, "loss": 0.0655, "reward": 2.00390625, "reward_std": 0.3861619830131531, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94921875, "step": 1795 }, { "clip_ratio": 0.0, "completion_length": 681.5078125, "epoch": 0.7184, "grad_norm": 0.21126269642542955, "kl": 0.153076171875, "learning_rate": 4.4544601312229295e-06, "loss": 0.0277, "reward": 2.08203125, "reward_std": 0.15207062661647797, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1796 }, { "clip_ratio": 0.0, "completion_length": 690.921875, "epoch": 0.7188, "grad_norm": 0.597381145898923, "kl": 0.1923828125, "learning_rate": 4.442846566193034e-06, "loss": 0.1942, "reward": 1.70703125, "reward_std": 0.44321712106466293, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.8125, "rewards/tag_count_reward": 0.88671875, "step": 1797 }, { "clip_ratio": 0.0, "completion_length": 693.1484375, "epoch": 0.7192, "grad_norm": 0.39499277680749134, "kl": 0.177978515625, "learning_rate": 4.4312438351181246e-06, "loss": 0.0875, "reward": 1.912109375, "reward_std": 0.2719729542732239, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 1798 }, { "clip_ratio": 0.0, "completion_length": 706.0859375, "epoch": 0.7196, "grad_norm": 0.5084338453440966, "kl": 0.163818359375, "learning_rate": 4.419651960618302e-06, "loss": 0.1005, "reward": 1.927734375, "reward_std": 0.4368308186531067, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.951171875, "step": 1799 }, { "clip_ratio": 0.0, "completion_length": 652.6484375, "epoch": 0.72, "grad_norm": 0.6754478012679402, "kl": 0.157958984375, "learning_rate": 4.408070965292534e-06, "loss": 0.0941, "reward": 2.064453125, "reward_std": 0.31350262463092804, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.962890625, "step": 1800 }, { "clip_ratio": 0.0, "completion_length": 708.921875, "epoch": 0.7204, "grad_norm": 2.243328201088769, "kl": 0.178466796875, "learning_rate": 4.3965008717185555e-06, "loss": 0.1179, "reward": 1.927734375, "reward_std": 0.37477757036685944, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.951171875, "step": 1801 }, { "clip_ratio": 0.0, "completion_length": 694.6875, "epoch": 0.7208, "grad_norm": 0.25209361071098396, "kl": 0.14892578125, "learning_rate": 4.384941702452856e-06, "loss": 0.0507, "reward": 2.0078125, "reward_std": 0.2958957478404045, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1802 }, { "clip_ratio": 0.0, "completion_length": 641.46875, "epoch": 0.7212, "grad_norm": 0.2599912153823635, "kl": 0.1669921875, "learning_rate": 4.373393480030637e-06, "loss": 0.0545, "reward": 2.140625, "reward_std": 0.24960751086473465, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1803 }, { "clip_ratio": 0.0, "completion_length": 717.96875, "epoch": 0.7216, "grad_norm": 0.21914197724812962, "kl": 0.147705078125, "learning_rate": 4.361856226965733e-06, "loss": 0.0398, "reward": 2.130859375, "reward_std": 0.1982821300625801, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1804 }, { "clip_ratio": 0.0, "completion_length": 640.9921875, "epoch": 0.722, "grad_norm": 0.7359743709535961, "kl": 0.15869140625, "learning_rate": 4.350329965750622e-06, "loss": 0.0746, "reward": 2.03515625, "reward_std": 0.42910782992839813, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.95703125, "step": 1805 }, { "clip_ratio": 0.0, "completion_length": 676.8359375, "epoch": 0.7224, "grad_norm": 0.3640735640873138, "kl": 0.1796875, "learning_rate": 4.338814718856333e-06, "loss": 0.1062, "reward": 1.91015625, "reward_std": 0.35335463285446167, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 1806 }, { "clip_ratio": 0.0, "completion_length": 704.078125, "epoch": 0.7228, "grad_norm": 1.1451527002144706, "kl": 0.160400390625, "learning_rate": 4.3273105087324375e-06, "loss": 0.0289, "reward": 1.962890625, "reward_std": 0.18315474689006805, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 1807 }, { "clip_ratio": 0.0, "completion_length": 683.53125, "epoch": 0.7232, "grad_norm": 0.2420018137865335, "kl": 0.152099609375, "learning_rate": 4.315817357806974e-06, "loss": 0.0685, "reward": 1.93359375, "reward_std": 0.26055096089839935, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 1808 }, { "clip_ratio": 0.0, "completion_length": 662.8515625, "epoch": 0.7236, "grad_norm": 0.3659539867368051, "kl": 0.173583984375, "learning_rate": 4.304335288486426e-06, "loss": 0.156, "reward": 1.8671875, "reward_std": 0.4332367554306984, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.9453125, "step": 1809 }, { "clip_ratio": 0.0, "completion_length": 728.0078125, "epoch": 0.724, "grad_norm": 0.20248957972878553, "kl": 0.14697265625, "learning_rate": 4.292864323155684e-06, "loss": 0.1498, "reward": 1.8203125, "reward_std": 0.44978154450654984, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.9140625, "step": 1810 }, { "clip_ratio": 0.0, "completion_length": 679.0859375, "epoch": 0.7244, "grad_norm": 0.3118774451053697, "kl": 0.15625, "learning_rate": 4.281404484177974e-06, "loss": 0.1024, "reward": 1.94140625, "reward_std": 0.47484270483255386, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94921875, "step": 1811 }, { "clip_ratio": 0.0, "completion_length": 648.4375, "epoch": 0.7248, "grad_norm": 0.352564175748664, "kl": 0.156494140625, "learning_rate": 4.26995579389485e-06, "loss": 0.1201, "reward": 2.15234375, "reward_std": 0.40591733902692795, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 1812 }, { "clip_ratio": 0.0, "completion_length": 725.03125, "epoch": 0.7252, "grad_norm": 13.925502284774257, "kl": 0.186279296875, "learning_rate": 4.258518274626103e-06, "loss": 0.176, "reward": 1.8984375, "reward_std": 0.521307185292244, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.921875, "step": 1813 }, { "clip_ratio": 0.0, "completion_length": 670.390625, "epoch": 0.7256, "grad_norm": 0.6555553553410479, "kl": 0.146240234375, "learning_rate": 4.247091948669775e-06, "loss": 0.1492, "reward": 1.93359375, "reward_std": 0.5542127937078476, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.91796875, "step": 1814 }, { "clip_ratio": 0.0, "completion_length": 743.484375, "epoch": 0.726, "grad_norm": 0.5464743746650554, "kl": 0.175048828125, "learning_rate": 4.235676838302069e-06, "loss": 0.105, "reward": 2.013671875, "reward_std": 0.5104482546448708, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.943359375, "step": 1815 }, { "clip_ratio": 0.0, "completion_length": 627.7265625, "epoch": 0.7264, "grad_norm": 0.4311065349074134, "kl": 0.1826171875, "learning_rate": 4.224272965777326e-06, "loss": 0.1169, "reward": 2.09765625, "reward_std": 0.38517172634601593, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 1816 }, { "clip_ratio": 0.0, "completion_length": 776.6328125, "epoch": 0.7268, "grad_norm": 0.27026319118097064, "kl": 0.150634765625, "learning_rate": 4.21288035332798e-06, "loss": 0.0429, "reward": 1.947265625, "reward_std": 0.4302191510796547, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.939453125, "step": 1817 }, { "clip_ratio": 0.0, "completion_length": 650.4375, "epoch": 0.7272, "grad_norm": 0.5531465869608444, "kl": 0.16943359375, "learning_rate": 4.201499023164508e-06, "loss": 0.0716, "reward": 2.05859375, "reward_std": 0.1663939654827118, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 1818 }, { "clip_ratio": 0.0, "completion_length": 674.703125, "epoch": 0.7276, "grad_norm": 0.939669337969933, "kl": 0.16259765625, "learning_rate": 4.190128997475402e-06, "loss": 0.0669, "reward": 2.0234375, "reward_std": 0.27672988921403885, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1819 }, { "clip_ratio": 0.0, "completion_length": 691.1484375, "epoch": 0.728, "grad_norm": 0.7536395307201573, "kl": 0.175048828125, "learning_rate": 4.178770298427107e-06, "loss": 0.0767, "reward": 2.005859375, "reward_std": 0.38423699140548706, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.935546875, "step": 1820 }, { "clip_ratio": 0.0, "completion_length": 743.09375, "epoch": 0.7284, "grad_norm": 0.728468763567274, "kl": 0.178466796875, "learning_rate": 4.167422948163986e-06, "loss": 0.1255, "reward": 1.73828125, "reward_std": 0.4909641370177269, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.828125, "rewards/tag_count_reward": 0.88671875, "step": 1821 }, { "clip_ratio": 0.0, "completion_length": 701.0078125, "epoch": 0.7288, "grad_norm": 0.9600453721457922, "kl": 0.171630859375, "learning_rate": 4.15608696880828e-06, "loss": 0.1295, "reward": 1.83984375, "reward_std": 0.3613668903708458, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.93359375, "step": 1822 }, { "clip_ratio": 0.0, "completion_length": 710.765625, "epoch": 0.7292, "grad_norm": 0.30562936267202967, "kl": 0.142578125, "learning_rate": 4.144762382460059e-06, "loss": 0.0874, "reward": 1.92578125, "reward_std": 0.32471734285354614, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1823 }, { "clip_ratio": 0.0, "completion_length": 660.9765625, "epoch": 0.7296, "grad_norm": 0.4744265749279907, "kl": 0.157470703125, "learning_rate": 4.133449211197188e-06, "loss": 0.1427, "reward": 1.90625, "reward_std": 0.4423728883266449, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.9453125, "step": 1824 }, { "clip_ratio": 0.0, "completion_length": 644.1015625, "epoch": 0.73, "grad_norm": 0.6629794467640465, "kl": 0.155029296875, "learning_rate": 4.12214747707527e-06, "loss": 0.0973, "reward": 2.216796875, "reward_std": 0.3215351775288582, "rewards/accuracy_reward": 0.3046875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 1825 }, { "clip_ratio": 0.0, "completion_length": 694.59375, "epoch": 0.7304, "grad_norm": 0.3567320287452618, "kl": 0.133544921875, "learning_rate": 4.110857202127615e-06, "loss": 0.0999, "reward": 2.068359375, "reward_std": 0.4935379922389984, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.943359375, "step": 1826 }, { "clip_ratio": 0.0, "completion_length": 576.203125, "epoch": 0.7308, "grad_norm": 0.3960354585809517, "kl": 0.1336669921875, "learning_rate": 4.099578408365192e-06, "loss": 0.1106, "reward": 1.96484375, "reward_std": 0.23096735030412674, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1827 }, { "clip_ratio": 0.0, "completion_length": 755.71875, "epoch": 0.7312, "grad_norm": 0.2715854821324621, "kl": 0.17724609375, "learning_rate": 4.08831111777658e-06, "loss": 0.0668, "reward": 1.83203125, "reward_std": 0.3278057351708412, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.93359375, "step": 1828 }, { "clip_ratio": 0.0, "completion_length": 732.75, "epoch": 0.7316, "grad_norm": 0.6255755569304944, "kl": 0.18408203125, "learning_rate": 4.0770553523279535e-06, "loss": 0.1254, "reward": 2.10546875, "reward_std": 0.395060658454895, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.94140625, "step": 1829 }, { "clip_ratio": 0.0, "completion_length": 673.125, "epoch": 0.732, "grad_norm": 0.8866449135703218, "kl": 0.17041015625, "learning_rate": 4.065811133962987e-06, "loss": 0.0993, "reward": 2.0, "reward_std": 0.521259032189846, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.9375, "step": 1830 }, { "clip_ratio": 0.0, "completion_length": 657.2890625, "epoch": 0.7324, "grad_norm": 0.740662076234443, "kl": 0.20263671875, "learning_rate": 4.05457848460287e-06, "loss": 0.0912, "reward": 2.005859375, "reward_std": 0.458378866314888, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 1831 }, { "clip_ratio": 0.0, "completion_length": 743.15625, "epoch": 0.7328, "grad_norm": 0.5198858635404588, "kl": 0.16162109375, "learning_rate": 4.04335742614622e-06, "loss": 0.0875, "reward": 2.046875, "reward_std": 0.550200380384922, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.9296875, "step": 1832 }, { "clip_ratio": 0.0, "completion_length": 732.7421875, "epoch": 0.7332, "grad_norm": 0.36945548720257515, "kl": 0.16357421875, "learning_rate": 4.032147980469072e-06, "loss": 0.0711, "reward": 1.986328125, "reward_std": 0.3165084794163704, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.970703125, "step": 1833 }, { "clip_ratio": 0.0, "completion_length": 680.53125, "epoch": 0.7336, "grad_norm": 0.395574508278587, "kl": 0.151611328125, "learning_rate": 4.020950169424815e-06, "loss": 0.1667, "reward": 1.91015625, "reward_std": 0.5902808457612991, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.91015625, "step": 1834 }, { "clip_ratio": 0.0, "completion_length": 754.578125, "epoch": 0.734, "grad_norm": 0.2696731529910623, "kl": 0.16845703125, "learning_rate": 4.009764014844143e-06, "loss": 0.0819, "reward": 2.02734375, "reward_std": 0.29167594015598297, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 1835 }, { "clip_ratio": 0.0, "completion_length": 678.921875, "epoch": 0.7344, "grad_norm": 0.4099197860031814, "kl": 0.19287109375, "learning_rate": 3.998589538535046e-06, "loss": 0.0625, "reward": 2.236328125, "reward_std": 0.2444654181599617, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 1836 }, { "clip_ratio": 0.0, "completion_length": 610.4765625, "epoch": 0.7348, "grad_norm": 0.3500926792679815, "kl": 0.18994140625, "learning_rate": 3.987426762282733e-06, "loss": 0.1051, "reward": 1.99609375, "reward_std": 0.2794423997402191, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 1837 }, { "clip_ratio": 0.0, "completion_length": 719.53125, "epoch": 0.7352, "grad_norm": 0.28230838200468056, "kl": 0.145751953125, "learning_rate": 3.976275707849616e-06, "loss": 0.0674, "reward": 2.05859375, "reward_std": 0.29909778386354446, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1838 }, { "clip_ratio": 0.0, "completion_length": 720.90625, "epoch": 0.7356, "grad_norm": 0.30178018158773473, "kl": 0.164306640625, "learning_rate": 3.965136396975235e-06, "loss": 0.0474, "reward": 1.990234375, "reward_std": 0.32409055531024933, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 1839 }, { "clip_ratio": 0.0, "completion_length": 710.375, "epoch": 0.736, "grad_norm": 0.7507612819051124, "kl": 0.19140625, "learning_rate": 3.954008851376252e-06, "loss": 0.1465, "reward": 1.80859375, "reward_std": 0.4952165484428406, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.91015625, "step": 1840 }, { "clip_ratio": 0.0, "completion_length": 692.15625, "epoch": 0.7364, "grad_norm": 0.39471820581084127, "kl": 0.174560546875, "learning_rate": 3.942893092746387e-06, "loss": 0.0295, "reward": 1.96484375, "reward_std": 0.18246503919363022, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1841 }, { "clip_ratio": 0.0, "completion_length": 716.6328125, "epoch": 0.7368, "grad_norm": 1.4999515212964138, "kl": 0.156005859375, "learning_rate": 3.931789142756377e-06, "loss": 0.0659, "reward": 1.95703125, "reward_std": 0.3133533075451851, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 1842 }, { "clip_ratio": 0.0, "completion_length": 681.921875, "epoch": 0.7372, "grad_norm": 0.31562408962396726, "kl": 0.16650390625, "learning_rate": 3.920697023053949e-06, "loss": 0.0492, "reward": 2.01953125, "reward_std": 0.26221734285354614, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.98046875, "step": 1843 }, { "clip_ratio": 0.0, "completion_length": 682.7265625, "epoch": 0.7376, "grad_norm": 0.2853903985101533, "kl": 0.15869140625, "learning_rate": 3.9096167552637454e-06, "loss": 0.042, "reward": 2.13671875, "reward_std": 0.2962224632501602, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 1844 }, { "clip_ratio": 0.0, "completion_length": 697.0546875, "epoch": 0.738, "grad_norm": 1.1735432665943764, "kl": 0.167724609375, "learning_rate": 3.898548360987325e-06, "loss": 0.0847, "reward": 1.99609375, "reward_std": 0.40206409245729446, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.95703125, "step": 1845 }, { "clip_ratio": 0.0, "completion_length": 683.5, "epoch": 0.7384, "grad_norm": 0.31953456700780497, "kl": 0.143310546875, "learning_rate": 3.887491861803085e-06, "loss": 0.1252, "reward": 1.970703125, "reward_std": 0.3808627650141716, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.955078125, "step": 1846 }, { "clip_ratio": 0.0, "completion_length": 602.1953125, "epoch": 0.7388, "grad_norm": 0.6345568819256507, "kl": 0.152099609375, "learning_rate": 3.876447279266238e-06, "loss": 0.1447, "reward": 2.123046875, "reward_std": 0.3927247151732445, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 1847 }, { "clip_ratio": 0.0, "completion_length": 681.3671875, "epoch": 0.7392, "grad_norm": 0.34801278267700647, "kl": 0.166259765625, "learning_rate": 3.86541463490876e-06, "loss": 0.1212, "reward": 2.06640625, "reward_std": 0.4126245081424713, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 1848 }, { "clip_ratio": 0.0, "completion_length": 544.546875, "epoch": 0.7396, "grad_norm": 0.32014841974687774, "kl": 0.1668701171875, "learning_rate": 3.854393950239356e-06, "loss": 0.0607, "reward": 2.15625, "reward_std": 0.22170055657625198, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1849 }, { "clip_ratio": 0.0, "completion_length": 760.4296875, "epoch": 0.74, "grad_norm": 0.558538002856548, "kl": 0.149658203125, "learning_rate": 3.8433852467434175e-06, "loss": 0.0392, "reward": 2.001953125, "reward_std": 0.27803927659988403, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 1850 }, { "clip_ratio": 0.0, "completion_length": 738.6796875, "epoch": 0.7404, "grad_norm": 0.5965227048299429, "kl": 0.14208984375, "learning_rate": 3.832388545882975e-06, "loss": 0.0258, "reward": 2.087890625, "reward_std": 0.2058747112751007, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1851 }, { "clip_ratio": 0.0, "completion_length": 634.4140625, "epoch": 0.7408, "grad_norm": 0.4525622558965772, "kl": 0.17041015625, "learning_rate": 3.821403869096658e-06, "loss": 0.1238, "reward": 1.921875, "reward_std": 0.4342353865504265, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.9609375, "step": 1852 }, { "clip_ratio": 0.0, "completion_length": 701.7890625, "epoch": 0.7412, "grad_norm": 0.5491276622363731, "kl": 0.171875, "learning_rate": 3.810431237799657e-06, "loss": 0.09, "reward": 1.962890625, "reward_std": 0.4102044627070427, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 1853 }, { "clip_ratio": 0.0, "completion_length": 682.390625, "epoch": 0.7416, "grad_norm": 0.47743293424628386, "kl": 0.1787109375, "learning_rate": 3.7994706733836738e-06, "loss": 0.1326, "reward": 2.16015625, "reward_std": 0.38111525774002075, "rewards/accuracy_reward": 0.2890625, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94921875, "step": 1854 }, { "clip_ratio": 0.0, "completion_length": 548.2734375, "epoch": 0.742, "grad_norm": 0.2805025851415809, "kl": 0.155029296875, "learning_rate": 3.7885221972168974e-06, "loss": 0.0265, "reward": 2.005859375, "reward_std": 0.08175812661647797, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 1855 }, { "clip_ratio": 0.0, "completion_length": 541.3203125, "epoch": 0.7424, "grad_norm": 0.2338343500373135, "kl": 0.16748046875, "learning_rate": 3.7775858306439374e-06, "loss": 0.0736, "reward": 2.091796875, "reward_std": 0.13948732614517212, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1856 }, { "clip_ratio": 0.0, "completion_length": 664.2578125, "epoch": 0.7428, "grad_norm": 0.3885055889893681, "kl": 0.15283203125, "learning_rate": 3.766661594985801e-06, "loss": 0.1164, "reward": 2.248046875, "reward_std": 0.44596706330776215, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.958984375, "step": 1857 }, { "clip_ratio": 0.0, "completion_length": 653.921875, "epoch": 0.7432, "grad_norm": 0.4436931761244662, "kl": 0.1611328125, "learning_rate": 3.7557495115398446e-06, "loss": 0.0845, "reward": 2.119140625, "reward_std": 0.4369521662592888, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1858 }, { "clip_ratio": 0.0, "completion_length": 651.1171875, "epoch": 0.7436, "grad_norm": 0.49746323198895925, "kl": 0.185546875, "learning_rate": 3.7448496015797296e-06, "loss": 0.0615, "reward": 2.228515625, "reward_std": 0.33431944996118546, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.962890625, "step": 1859 }, { "clip_ratio": 0.0, "completion_length": 621.8984375, "epoch": 0.744, "grad_norm": 0.2835211838262705, "kl": 0.15380859375, "learning_rate": 3.7339618863553983e-06, "loss": 0.0577, "reward": 1.966796875, "reward_std": 0.16065485030412674, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1860 }, { "clip_ratio": 0.0, "completion_length": 667.40625, "epoch": 0.7444, "grad_norm": 0.3332488557437649, "kl": 0.16552734375, "learning_rate": 3.723086387092997e-06, "loss": 0.1046, "reward": 1.91796875, "reward_std": 0.328125, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1861 }, { "clip_ratio": 0.0, "completion_length": 700.6953125, "epoch": 0.7448, "grad_norm": 0.2910016180178841, "kl": 0.149169921875, "learning_rate": 3.7122231249948747e-06, "loss": 0.0892, "reward": 2.0625, "reward_std": 0.31449709087610245, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1862 }, { "clip_ratio": 0.0, "completion_length": 726.9609375, "epoch": 0.7452, "grad_norm": 1.2983142916676824, "kl": 0.217529296875, "learning_rate": 3.7013721212395128e-06, "loss": 0.0607, "reward": 2.0078125, "reward_std": 0.22178399562835693, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1863 }, { "clip_ratio": 0.0, "completion_length": 707.1015625, "epoch": 0.7456, "grad_norm": 0.4131359563203512, "kl": 0.151611328125, "learning_rate": 3.6905333969815038e-06, "loss": 0.0713, "reward": 1.96484375, "reward_std": 0.28750747442245483, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1864 }, { "clip_ratio": 0.0, "completion_length": 746.234375, "epoch": 0.746, "grad_norm": 0.8394567477420177, "kl": 0.159912109375, "learning_rate": 3.679706973351491e-06, "loss": 0.0787, "reward": 1.921875, "reward_std": 0.35196130722761154, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.9609375, "step": 1865 }, { "clip_ratio": 0.0, "completion_length": 667.8203125, "epoch": 0.7464, "grad_norm": 0.21375119703313855, "kl": 0.15087890625, "learning_rate": 3.6688928714561444e-06, "loss": -0.0101, "reward": 2.28125, "reward_std": 0.18217839300632477, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.9921875, "step": 1866 }, { "clip_ratio": 0.0, "completion_length": 701.5390625, "epoch": 0.7468, "grad_norm": 0.8174460027481516, "kl": 0.17236328125, "learning_rate": 3.658091112378106e-06, "loss": 0.0946, "reward": 1.880859375, "reward_std": 0.3360717296600342, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.935546875, "step": 1867 }, { "clip_ratio": 0.0, "completion_length": 705.9296875, "epoch": 0.7472, "grad_norm": 0.4799420304285999, "kl": 0.173828125, "learning_rate": 3.6473017171759563e-06, "loss": 0.0843, "reward": 1.892578125, "reward_std": 0.34793953597545624, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.947265625, "step": 1868 }, { "clip_ratio": 0.0, "completion_length": 766.171875, "epoch": 0.7476, "grad_norm": 0.4106378655299726, "kl": 0.158935546875, "learning_rate": 3.636524706884181e-06, "loss": 0.1044, "reward": 2.1171875, "reward_std": 0.46281829476356506, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.9375, "step": 1869 }, { "clip_ratio": 0.0, "completion_length": 687.8984375, "epoch": 0.748, "grad_norm": 0.3338323239738612, "kl": 0.15087890625, "learning_rate": 3.625760102513103e-06, "loss": 0.1347, "reward": 1.994140625, "reward_std": 0.3638884201645851, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.939453125, "step": 1870 }, { "clip_ratio": 0.0, "completion_length": 731.3515625, "epoch": 0.7484, "grad_norm": 0.47780801027658715, "kl": 0.1494140625, "learning_rate": 3.615007925048878e-06, "loss": 0.0869, "reward": 2.080078125, "reward_std": 0.20980776846408844, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 1871 }, { "clip_ratio": 0.0, "completion_length": 697.8046875, "epoch": 0.7488, "grad_norm": 0.4321294635925993, "kl": 0.1630859375, "learning_rate": 3.604268195453421e-06, "loss": 0.1143, "reward": 1.91796875, "reward_std": 0.3515719026327133, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 1872 }, { "clip_ratio": 0.0, "completion_length": 743.5234375, "epoch": 0.7492, "grad_norm": 0.5689198018809049, "kl": 0.190673828125, "learning_rate": 3.5935409346643835e-06, "loss": 0.0503, "reward": 2.05078125, "reward_std": 0.22930096089839935, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 1873 }, { "clip_ratio": 0.0, "completion_length": 734.8671875, "epoch": 0.7496, "grad_norm": 0.2332233323967726, "kl": 0.1429443359375, "learning_rate": 3.582826163595119e-06, "loss": 0.0547, "reward": 1.931640625, "reward_std": 0.25771110504865646, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.962890625, "step": 1874 }, { "clip_ratio": 0.0, "completion_length": 682.9296875, "epoch": 0.75, "grad_norm": 0.27881346756811903, "kl": 0.1435546875, "learning_rate": 3.5721239031346067e-06, "loss": 0.1154, "reward": 1.919921875, "reward_std": 0.35886453092098236, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.951171875, "step": 1875 }, { "clip_ratio": 0.0, "completion_length": 718.59375, "epoch": 0.7504, "grad_norm": 0.2397692212493823, "kl": 0.150634765625, "learning_rate": 3.5614341741474633e-06, "loss": 0.0526, "reward": 2.001953125, "reward_std": 0.29186275601387024, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1876 }, { "clip_ratio": 0.0, "completion_length": 716.5390625, "epoch": 0.7508, "grad_norm": 0.5462789940879914, "kl": 0.151123046875, "learning_rate": 3.5507569974738575e-06, "loss": 0.0766, "reward": 2.162109375, "reward_std": 0.32253529131412506, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 1877 }, { "clip_ratio": 0.0, "completion_length": 773.8203125, "epoch": 0.7512, "grad_norm": 0.3235456923253998, "kl": 0.1376953125, "learning_rate": 3.540092393929494e-06, "loss": 0.0832, "reward": 1.91796875, "reward_std": 0.4789455384016037, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94140625, "step": 1878 }, { "clip_ratio": 0.0, "completion_length": 707.8125, "epoch": 0.7516, "grad_norm": 0.30509439995685583, "kl": 0.1494140625, "learning_rate": 3.5294403843055604e-06, "loss": 0.0564, "reward": 2.076171875, "reward_std": 0.2644873261451721, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1879 }, { "clip_ratio": 0.0, "completion_length": 790.4375, "epoch": 0.752, "grad_norm": 19.34661139995742, "kl": 1.5892333984375, "learning_rate": 3.5188009893686916e-06, "loss": 0.2115, "reward": 1.734375, "reward_std": 0.5450569689273834, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.828125, "rewards/tag_count_reward": 0.8984375, "step": 1880 }, { "clip_ratio": 0.0, "completion_length": 723.4375, "epoch": 0.7524, "grad_norm": 0.2869600045821545, "kl": 0.138916015625, "learning_rate": 3.50817422986094e-06, "loss": 0.0638, "reward": 1.958984375, "reward_std": 0.37440086901187897, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 1881 }, { "clip_ratio": 0.0, "completion_length": 762.609375, "epoch": 0.7528, "grad_norm": 0.32055833760587654, "kl": 0.1572265625, "learning_rate": 3.4975601264997094e-06, "loss": 0.1285, "reward": 1.79296875, "reward_std": 0.5479542315006256, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.91796875, "step": 1882 }, { "clip_ratio": 0.0, "completion_length": 768.734375, "epoch": 0.7532, "grad_norm": 0.5211509673556437, "kl": 0.156982421875, "learning_rate": 3.4869586999777492e-06, "loss": 0.062, "reward": 1.853515625, "reward_std": 0.41532962024211884, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.931640625, "step": 1883 }, { "clip_ratio": 0.0, "completion_length": 759.0625, "epoch": 0.7536, "grad_norm": 0.8856623325292257, "kl": 0.20654296875, "learning_rate": 3.476369970963072e-06, "loss": 0.1641, "reward": 1.875, "reward_std": 0.5957323461771011, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.8359375, "rewards/tag_count_reward": 0.890625, "step": 1884 }, { "clip_ratio": 0.0, "completion_length": 673.953125, "epoch": 0.754, "grad_norm": 0.45258929348429877, "kl": 0.17041015625, "learning_rate": 3.4657939600989453e-06, "loss": 0.1689, "reward": 1.916015625, "reward_std": 0.5715730041265488, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.916015625, "step": 1885 }, { "clip_ratio": 0.0, "completion_length": 683.6328125, "epoch": 0.7544, "grad_norm": 4.648092507714148, "kl": 0.162353515625, "learning_rate": 3.455230688003852e-06, "loss": 0.1042, "reward": 2.0546875, "reward_std": 0.36033055931329727, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.953125, "step": 1886 }, { "clip_ratio": 0.0, "completion_length": 696.375, "epoch": 0.7548, "grad_norm": 0.799663871654313, "kl": 0.211181640625, "learning_rate": 3.4446801752714287e-06, "loss": 0.1551, "reward": 1.873046875, "reward_std": 0.4610339254140854, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.951171875, "step": 1887 }, { "clip_ratio": 0.0, "completion_length": 726.6875, "epoch": 0.7552, "grad_norm": 0.41733792173077516, "kl": 0.151123046875, "learning_rate": 3.4341424424704373e-06, "loss": 0.1059, "reward": 1.919921875, "reward_std": 0.6173110380768776, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.912109375, "step": 1888 }, { "clip_ratio": 0.0, "completion_length": 713.3359375, "epoch": 0.7556, "grad_norm": 0.4869347728022482, "kl": 0.170166015625, "learning_rate": 3.4236175101447265e-06, "loss": 0.0997, "reward": 2.08984375, "reward_std": 0.4529557153582573, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.94921875, "step": 1889 }, { "clip_ratio": 0.0, "completion_length": 670.8203125, "epoch": 0.756, "grad_norm": 0.45609983627438305, "kl": 0.149169921875, "learning_rate": 3.4131053988131947e-06, "loss": 0.1708, "reward": 2.02734375, "reward_std": 0.5232008472084999, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.93359375, "step": 1890 }, { "clip_ratio": 0.0, "completion_length": 619.3359375, "epoch": 0.7564, "grad_norm": 0.4795879954441894, "kl": 0.17333984375, "learning_rate": 3.4026061289697397e-06, "loss": 0.1034, "reward": 2.04296875, "reward_std": 0.46856170892715454, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.93359375, "step": 1891 }, { "clip_ratio": 0.0, "completion_length": 657.0, "epoch": 0.7568, "grad_norm": 0.8772923119324123, "kl": 0.23828125, "learning_rate": 3.3921197210832235e-06, "loss": 0.1374, "reward": 1.8359375, "reward_std": 0.3930308595299721, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.9296875, "step": 1892 }, { "clip_ratio": 0.0, "completion_length": 653.1171875, "epoch": 0.7572, "grad_norm": 0.3679197025683153, "kl": 0.130859375, "learning_rate": 3.381646195597437e-06, "loss": 0.112, "reward": 2.17578125, "reward_std": 0.3025054410099983, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1893 }, { "clip_ratio": 0.0, "completion_length": 708.6328125, "epoch": 0.7576, "grad_norm": 0.7343292229544122, "kl": 0.173583984375, "learning_rate": 3.3711855729310482e-06, "loss": 0.1016, "reward": 1.93359375, "reward_std": 0.40081750601530075, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94140625, "step": 1894 }, { "clip_ratio": 0.0, "completion_length": 691.4375, "epoch": 0.758, "grad_norm": 1.3148893848374996, "kl": 0.17333984375, "learning_rate": 3.360737873477584e-06, "loss": 0.1812, "reward": 1.892578125, "reward_std": 0.5819406285881996, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.8359375, "rewards/tag_count_reward": 0.884765625, "step": 1895 }, { "clip_ratio": 0.0, "completion_length": 678.7890625, "epoch": 0.7584, "grad_norm": 0.31460156452270294, "kl": 0.12939453125, "learning_rate": 3.3503031176053657e-06, "loss": 0.1507, "reward": 1.8828125, "reward_std": 0.4132187142968178, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.9453125, "step": 1896 }, { "clip_ratio": 0.0, "completion_length": 663.328125, "epoch": 0.7588, "grad_norm": 0.4278790729792082, "kl": 0.1500244140625, "learning_rate": 3.3398813256574847e-06, "loss": 0.1403, "reward": 1.9453125, "reward_std": 0.5077133551239967, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.9296875, "step": 1897 }, { "clip_ratio": 0.0, "completion_length": 754.640625, "epoch": 0.7592, "grad_norm": 0.6541859388454174, "kl": 0.1552734375, "learning_rate": 3.3294725179517573e-06, "loss": 0.0562, "reward": 2.001953125, "reward_std": 0.39080148935317993, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.962890625, "step": 1898 }, { "clip_ratio": 0.0, "completion_length": 753.484375, "epoch": 0.7596, "grad_norm": 0.3396439386372216, "kl": 0.148681640625, "learning_rate": 3.3190767147806825e-06, "loss": 0.0851, "reward": 1.9140625, "reward_std": 0.3493804410099983, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.96875, "step": 1899 }, { "clip_ratio": 0.0, "completion_length": 724.71875, "epoch": 0.76, "grad_norm": 1.3299217297879282, "kl": 0.27587890625, "learning_rate": 3.308693936411421e-06, "loss": 0.0226, "reward": 1.994140625, "reward_std": 0.2658159136772156, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1900 }, { "clip_ratio": 0.0, "completion_length": 760.1640625, "epoch": 0.7604, "grad_norm": 0.19289201136726652, "kl": 0.125, "learning_rate": 3.2983242030857177e-06, "loss": 0.0505, "reward": 1.884765625, "reward_std": 0.2797933965921402, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.955078125, "step": 1901 }, { "clip_ratio": 0.0, "completion_length": 758.3046875, "epoch": 0.7608, "grad_norm": 0.22470294148944894, "kl": 0.1494140625, "learning_rate": 3.287967535019908e-06, "loss": 0.0843, "reward": 1.8125, "reward_std": 0.3232303261756897, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.921875, "step": 1902 }, { "clip_ratio": 0.0, "completion_length": 655.1015625, "epoch": 0.7612, "grad_norm": 0.6711283522273054, "kl": 0.18994140625, "learning_rate": 3.2776239524048426e-06, "loss": 0.139, "reward": 1.94140625, "reward_std": 0.4282091557979584, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 1903 }, { "clip_ratio": 0.0, "completion_length": 694.9453125, "epoch": 0.7616, "grad_norm": 0.5095997251541465, "kl": 0.19921875, "learning_rate": 3.2672934754058615e-06, "loss": 0.1434, "reward": 1.875, "reward_std": 0.48730097711086273, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.921875, "step": 1904 }, { "clip_ratio": 0.0, "completion_length": 690.3984375, "epoch": 0.762, "grad_norm": 0.7548369294626153, "kl": 0.152099609375, "learning_rate": 3.2569761241627694e-06, "loss": 0.1482, "reward": 2.03125, "reward_std": 0.5173273608088493, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9296875, "step": 1905 }, { "clip_ratio": 0.0, "completion_length": 654.1953125, "epoch": 0.7624, "grad_norm": 0.5178967298033165, "kl": 0.1416015625, "learning_rate": 3.2466719187897555e-06, "loss": 0.1622, "reward": 1.859375, "reward_std": 0.421518936753273, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9375, "step": 1906 }, { "clip_ratio": 0.0, "completion_length": 686.5, "epoch": 0.7628, "grad_norm": 3.067833360131322, "kl": 0.18310546875, "learning_rate": 3.2363808793754082e-06, "loss": 0.055, "reward": 1.958984375, "reward_std": 0.1640625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1907 }, { "clip_ratio": 0.0, "completion_length": 643.7578125, "epoch": 0.7632, "grad_norm": 0.3506112849921766, "kl": 0.18408203125, "learning_rate": 3.2261030259826287e-06, "loss": 0.2014, "reward": 1.921875, "reward_std": 0.5231668278574944, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.9453125, "step": 1908 }, { "clip_ratio": 0.0, "completion_length": 649.3671875, "epoch": 0.7636, "grad_norm": 0.28526718846750787, "kl": 0.14306640625, "learning_rate": 3.2158383786486204e-06, "loss": 0.0628, "reward": 1.994140625, "reward_std": 0.20433919876813889, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 1909 }, { "clip_ratio": 0.0, "completion_length": 658.8203125, "epoch": 0.764, "grad_norm": 0.3133611265534207, "kl": 0.151611328125, "learning_rate": 3.2055869573848374e-06, "loss": 0.1174, "reward": 1.90625, "reward_std": 0.39064352214336395, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.9453125, "step": 1910 }, { "clip_ratio": 0.0, "completion_length": 715.3125, "epoch": 0.7644, "grad_norm": 0.6243021206592295, "kl": 0.142333984375, "learning_rate": 3.195348782176948e-06, "loss": 0.0506, "reward": 1.859375, "reward_std": 0.25719641894102097, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9375, "step": 1911 }, { "clip_ratio": 0.0, "completion_length": 554.5, "epoch": 0.7648, "grad_norm": 0.21001536606237398, "kl": 0.1533203125, "learning_rate": 3.1851238729848033e-06, "loss": 0.0493, "reward": 2.203125, "reward_std": 0.15973417460918427, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 1912 }, { "clip_ratio": 0.0, "completion_length": 588.4453125, "epoch": 0.7652, "grad_norm": 0.8674708890913266, "kl": 0.138671875, "learning_rate": 3.174912249742382e-06, "loss": 0.1785, "reward": 1.986328125, "reward_std": 0.45390585064888, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.939453125, "step": 1913 }, { "clip_ratio": 0.0, "completion_length": 631.3046875, "epoch": 0.7656, "grad_norm": 0.30200234772467904, "kl": 0.1474609375, "learning_rate": 3.164713932357776e-06, "loss": 0.0701, "reward": 1.951171875, "reward_std": 0.23119282722473145, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1914 }, { "clip_ratio": 0.0, "completion_length": 678.0078125, "epoch": 0.766, "grad_norm": 0.22565887419818448, "kl": 0.156005859375, "learning_rate": 3.1545289407131128e-06, "loss": 0.103, "reward": 1.93359375, "reward_std": 0.3099479079246521, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1915 }, { "clip_ratio": 0.0, "completion_length": 682.3671875, "epoch": 0.7664, "grad_norm": 0.5772663018095273, "kl": 0.16552734375, "learning_rate": 3.144357294664565e-06, "loss": 0.0767, "reward": 1.921875, "reward_std": 0.4478696435689926, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.953125, "step": 1916 }, { "clip_ratio": 0.0, "completion_length": 740.7578125, "epoch": 0.7668, "grad_norm": 0.5152548944271925, "kl": 0.157470703125, "learning_rate": 3.134199014042274e-06, "loss": 0.0433, "reward": 1.986328125, "reward_std": 0.34771182388067245, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.955078125, "step": 1917 }, { "clip_ratio": 0.0, "completion_length": 730.0, "epoch": 0.7672, "grad_norm": 0.27863131453941115, "kl": 0.16259765625, "learning_rate": 3.124054118650327e-06, "loss": 0.0665, "reward": 1.935546875, "reward_std": 0.19445691257715225, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1918 }, { "clip_ratio": 0.0, "completion_length": 690.328125, "epoch": 0.7676, "grad_norm": 0.2265639593260742, "kl": 0.128173828125, "learning_rate": 3.113922628266718e-06, "loss": 0.0852, "reward": 2.171875, "reward_std": 0.24492596089839935, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1919 }, { "clip_ratio": 0.0, "completion_length": 685.3359375, "epoch": 0.768, "grad_norm": 0.26610292463425034, "kl": 0.14013671875, "learning_rate": 3.103804562643302e-06, "loss": 0.0697, "reward": 2.029296875, "reward_std": 0.29719996452331543, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1920 }, { "clip_ratio": 0.0, "completion_length": 650.5546875, "epoch": 0.7684, "grad_norm": 0.6032898459978128, "kl": 0.177001953125, "learning_rate": 3.0936999415057712e-06, "loss": 0.1478, "reward": 1.91015625, "reward_std": 0.40286221355199814, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94140625, "step": 1921 }, { "clip_ratio": 0.0, "completion_length": 652.75, "epoch": 0.7688, "grad_norm": 0.4477212021589246, "kl": 0.183349609375, "learning_rate": 3.0836087845536e-06, "loss": 0.0916, "reward": 1.98828125, "reward_std": 0.3800763785839081, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1922 }, { "clip_ratio": 0.0, "completion_length": 713.3671875, "epoch": 0.7692, "grad_norm": 0.32216485660544675, "kl": 0.149658203125, "learning_rate": 3.073531111460013e-06, "loss": 0.0661, "reward": 2.0, "reward_std": 0.2691895663738251, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.9453125, "step": 1923 }, { "clip_ratio": 0.0, "completion_length": 637.515625, "epoch": 0.7696, "grad_norm": 0.9114385476975242, "kl": 0.1328125, "learning_rate": 3.063466941871952e-06, "loss": 0.1483, "reward": 1.951171875, "reward_std": 0.3550765812397003, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 1924 }, { "clip_ratio": 0.0, "completion_length": 685.0703125, "epoch": 0.77, "grad_norm": 0.2675311746432419, "kl": 0.148681640625, "learning_rate": 3.0534162954100264e-06, "loss": 0.0865, "reward": 1.908203125, "reward_std": 0.3384895622730255, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 1925 }, { "clip_ratio": 0.0, "completion_length": 702.296875, "epoch": 0.7704, "grad_norm": 0.9061165714140803, "kl": 0.194091796875, "learning_rate": 3.043379191668492e-06, "loss": 0.046, "reward": 2.0703125, "reward_std": 0.14286844432353973, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1926 }, { "clip_ratio": 0.0, "completion_length": 745.1640625, "epoch": 0.7708, "grad_norm": 0.5241192845729936, "kl": 0.14599609375, "learning_rate": 3.033355650215193e-06, "loss": 0.0723, "reward": 2.154296875, "reward_std": 0.4754231125116348, "rewards/accuracy_reward": 0.3046875, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.935546875, "step": 1927 }, { "clip_ratio": 0.0, "completion_length": 600.8828125, "epoch": 0.7712, "grad_norm": 0.35350280370859805, "kl": 0.145751953125, "learning_rate": 3.023345690591537e-06, "loss": 0.1339, "reward": 2.03515625, "reward_std": 0.3594934865832329, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1928 }, { "clip_ratio": 0.0, "completion_length": 746.71875, "epoch": 0.7716, "grad_norm": 0.25467587261808117, "kl": 0.144775390625, "learning_rate": 3.013349332312451e-06, "loss": 0.0524, "reward": 2.048828125, "reward_std": 0.3189401477575302, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1929 }, { "clip_ratio": 0.0, "completion_length": 686.8984375, "epoch": 0.772, "grad_norm": 0.3179016139591085, "kl": 0.14306640625, "learning_rate": 3.003366594866345e-06, "loss": 0.0796, "reward": 1.96875, "reward_std": 0.2795058861374855, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1930 }, { "clip_ratio": 0.0, "completion_length": 695.1328125, "epoch": 0.7724, "grad_norm": 0.28886250274193226, "kl": 0.1475830078125, "learning_rate": 2.993397497715086e-06, "loss": 0.0717, "reward": 2.001953125, "reward_std": 0.4047117233276367, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.962890625, "step": 1931 }, { "clip_ratio": 0.0, "completion_length": 723.03125, "epoch": 0.7728, "grad_norm": 1.3184106460723852, "kl": 0.211669921875, "learning_rate": 2.983442060293926e-06, "loss": 0.1845, "reward": 1.806640625, "reward_std": 0.5746114701032639, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.8046875, "rewards/tag_count_reward": 0.861328125, "step": 1932 }, { "clip_ratio": 0.0, "completion_length": 652.7109375, "epoch": 0.7732, "grad_norm": 0.4165803780985707, "kl": 0.163818359375, "learning_rate": 2.9735003020115095e-06, "loss": 0.0793, "reward": 2.177734375, "reward_std": 0.4352499321103096, "rewards/accuracy_reward": 0.2734375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 1933 }, { "clip_ratio": 0.0, "completion_length": 761.1015625, "epoch": 0.7736, "grad_norm": 0.4325905473600973, "kl": 0.1527099609375, "learning_rate": 2.963572242249799e-06, "loss": 0.0368, "reward": 2.025390625, "reward_std": 0.23584209382534027, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.986328125, "step": 1934 }, { "clip_ratio": 0.0, "completion_length": 782.25, "epoch": 0.774, "grad_norm": 0.8073434537710785, "kl": 0.1494140625, "learning_rate": 2.953657900364053e-06, "loss": 0.0993, "reward": 1.904296875, "reward_std": 0.3974166065454483, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.8515625, "rewards/tag_count_reward": 0.904296875, "step": 1935 }, { "clip_ratio": 0.0, "completion_length": 603.828125, "epoch": 0.7744, "grad_norm": 0.614814576097469, "kl": 0.148681640625, "learning_rate": 2.9437572956827965e-06, "loss": 0.1664, "reward": 1.978515625, "reward_std": 0.4517209529876709, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.947265625, "step": 1936 }, { "clip_ratio": 0.0, "completion_length": 685.71875, "epoch": 0.7748, "grad_norm": 0.2397116713953403, "kl": 0.1494140625, "learning_rate": 2.9338704475077527e-06, "loss": 0.1289, "reward": 1.896484375, "reward_std": 0.3505558520555496, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.951171875, "step": 1937 }, { "clip_ratio": 0.0, "completion_length": 809.1015625, "epoch": 0.7752, "grad_norm": 0.25665131949057973, "kl": 0.14208984375, "learning_rate": 2.9239973751138495e-06, "loss": 0.0527, "reward": 1.78515625, "reward_std": 0.38983044028282166, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.90234375, "step": 1938 }, { "clip_ratio": 0.0, "completion_length": 758.9375, "epoch": 0.7756, "grad_norm": 0.347996363647138, "kl": 0.1414794921875, "learning_rate": 2.9141380977491373e-06, "loss": 0.0973, "reward": 1.939453125, "reward_std": 0.3204467296600342, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 1939 }, { "clip_ratio": 0.0, "completion_length": 736.8125, "epoch": 0.776, "grad_norm": 0.3240211019049787, "kl": 0.131103515625, "learning_rate": 2.9042926346347932e-06, "loss": 0.1331, "reward": 1.89453125, "reward_std": 0.477643758058548, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.90234375, "step": 1940 }, { "clip_ratio": 0.0, "completion_length": 724.4375, "epoch": 0.7764, "grad_norm": 0.22098102127569944, "kl": 0.145263671875, "learning_rate": 2.8944610049650377e-06, "loss": 0.0388, "reward": 1.958984375, "reward_std": 0.1640625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1941 }, { "clip_ratio": 0.0, "completion_length": 713.328125, "epoch": 0.7768, "grad_norm": 0.29520821447777124, "kl": 0.15771484375, "learning_rate": 2.884643227907147e-06, "loss": 0.0954, "reward": 2.16015625, "reward_std": 0.4496581181883812, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.94140625, "step": 1942 }, { "clip_ratio": 0.0, "completion_length": 716.40625, "epoch": 0.7772, "grad_norm": 0.3069004925467549, "kl": 0.15576171875, "learning_rate": 2.874839322601375e-06, "loss": 0.1166, "reward": 2.060546875, "reward_std": 0.45437487214803696, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.951171875, "step": 1943 }, { "clip_ratio": 0.0, "completion_length": 778.890625, "epoch": 0.7776, "grad_norm": 0.3180583776369634, "kl": 0.167236328125, "learning_rate": 2.8650493081609344e-06, "loss": 0.1143, "reward": 1.904296875, "reward_std": 0.5455500185489655, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.912109375, "step": 1944 }, { "clip_ratio": 0.0, "completion_length": 747.8984375, "epoch": 0.778, "grad_norm": 0.20790997656721344, "kl": 0.1268310546875, "learning_rate": 2.855273203671969e-06, "loss": 0.0526, "reward": 2.005859375, "reward_std": 0.31853410601615906, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.951171875, "step": 1945 }, { "clip_ratio": 0.0, "completion_length": 717.890625, "epoch": 0.7784, "grad_norm": 0.347287136481452, "kl": 0.14306640625, "learning_rate": 2.8455110281934804e-06, "loss": 0.0953, "reward": 2.09765625, "reward_std": 0.3744274005293846, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1946 }, { "clip_ratio": 0.0, "completion_length": 710.6640625, "epoch": 0.7788, "grad_norm": 0.6359703390415595, "kl": 0.1534423828125, "learning_rate": 2.8357628007573412e-06, "loss": 0.1406, "reward": 1.974609375, "reward_std": 0.3828125, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.935546875, "step": 1947 }, { "clip_ratio": 0.0, "completion_length": 774.296875, "epoch": 0.7792, "grad_norm": 0.5795371821033382, "kl": 0.147216796875, "learning_rate": 2.8260285403682153e-06, "loss": 0.0454, "reward": 1.8984375, "reward_std": 0.3606208860874176, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.953125, "step": 1948 }, { "clip_ratio": 0.0, "completion_length": 740.4609375, "epoch": 0.7796, "grad_norm": 0.24927418780564647, "kl": 0.140625, "learning_rate": 2.816308266003541e-06, "loss": 0.1319, "reward": 1.830078125, "reward_std": 0.4292280897498131, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.931640625, "step": 1949 }, { "clip_ratio": 0.0, "completion_length": 577.046875, "epoch": 0.78, "grad_norm": 0.1491355661637049, "kl": 0.13037109375, "learning_rate": 2.8066019966134907e-06, "loss": 0.0053, "reward": 2.1484375, "reward_std": 0.050389111042022705, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 1950 }, { "clip_ratio": 0.0, "completion_length": 775.5859375, "epoch": 0.7804, "grad_norm": 1.0951279402541017, "kl": 0.1298828125, "learning_rate": 2.796909751120931e-06, "loss": 0.0657, "reward": 1.892578125, "reward_std": 0.23778468370437622, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.947265625, "step": 1951 }, { "clip_ratio": 0.0, "completion_length": 732.0546875, "epoch": 0.7808, "grad_norm": 0.502633800268772, "kl": 0.134521484375, "learning_rate": 2.7872315484213954e-06, "loss": 0.0443, "reward": 2.1328125, "reward_std": 0.22284550219774246, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 1952 }, { "clip_ratio": 0.0, "completion_length": 701.140625, "epoch": 0.7812, "grad_norm": 0.19953442746999117, "kl": 0.137451171875, "learning_rate": 2.7775674073830337e-06, "loss": 0.0444, "reward": 1.9453125, "reward_std": 0.23344752937555313, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.9765625, "step": 1953 }, { "clip_ratio": 0.0, "completion_length": 659.2578125, "epoch": 0.7816, "grad_norm": 0.29353505857366213, "kl": 0.14453125, "learning_rate": 2.7679173468465813e-06, "loss": 0.0669, "reward": 1.953125, "reward_std": 0.21534235030412674, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1954 }, { "clip_ratio": 0.0, "completion_length": 633.6171875, "epoch": 0.782, "grad_norm": 0.14813843080522238, "kl": 0.129638671875, "learning_rate": 2.7582813856253276e-06, "loss": 0.08, "reward": 1.904296875, "reward_std": 0.22723282873630524, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 1955 }, { "clip_ratio": 0.0, "completion_length": 545.9921875, "epoch": 0.7824, "grad_norm": 0.35102188690481984, "kl": 0.148193359375, "learning_rate": 2.7486595425050667e-06, "loss": 0.1396, "reward": 2.015625, "reward_std": 0.3819151297211647, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9296875, "step": 1956 }, { "clip_ratio": 0.0, "completion_length": 699.7421875, "epoch": 0.7828, "grad_norm": 0.9069066793577013, "kl": 0.144287109375, "learning_rate": 2.739051836244081e-06, "loss": 0.0871, "reward": 1.96484375, "reward_std": 0.4288794547319412, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.95703125, "step": 1957 }, { "clip_ratio": 0.0, "completion_length": 685.65625, "epoch": 0.7832, "grad_norm": 0.2456150141566812, "kl": 0.12646484375, "learning_rate": 2.7294582855730835e-06, "loss": 0.1435, "reward": 1.951171875, "reward_std": 0.3359003961086273, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.966796875, "step": 1958 }, { "clip_ratio": 0.0, "completion_length": 814.78125, "epoch": 0.7836, "grad_norm": 0.3055593926811733, "kl": 0.137451171875, "learning_rate": 2.7198789091951903e-06, "loss": 0.0631, "reward": 1.80078125, "reward_std": 0.4373554661870003, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.91796875, "step": 1959 }, { "clip_ratio": 0.0, "completion_length": 707.5546875, "epoch": 0.784, "grad_norm": 0.3681435079144517, "kl": 0.130859375, "learning_rate": 2.7103137257858867e-06, "loss": 0.0497, "reward": 2.017578125, "reward_std": 0.22101997584104538, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 1960 }, { "clip_ratio": 0.0, "completion_length": 731.3671875, "epoch": 0.7844, "grad_norm": 0.28741817987980334, "kl": 0.14697265625, "learning_rate": 2.7007627539929847e-06, "loss": 0.096, "reward": 1.86328125, "reward_std": 0.4494984671473503, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.93359375, "step": 1961 }, { "clip_ratio": 0.0, "completion_length": 656.484375, "epoch": 0.7848, "grad_norm": 0.5376925193497815, "kl": 0.187744140625, "learning_rate": 2.6912260124366007e-06, "loss": 0.0723, "reward": 2.138671875, "reward_std": 0.2997613772749901, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1962 }, { "clip_ratio": 0.0, "completion_length": 625.4453125, "epoch": 0.7852, "grad_norm": 0.47963604601232135, "kl": 0.158203125, "learning_rate": 2.6817035197090892e-06, "loss": 0.0974, "reward": 2.11328125, "reward_std": 0.36809954792261124, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 1963 }, { "clip_ratio": 0.0, "completion_length": 661.15625, "epoch": 0.7856, "grad_norm": 0.14835485874290866, "kl": 0.1202392578125, "learning_rate": 2.672195294375045e-06, "loss": 0.0197, "reward": 2.181640625, "reward_std": 0.11873093992471695, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.994140625, "step": 1964 }, { "clip_ratio": 0.0, "completion_length": 639.5859375, "epoch": 0.786, "grad_norm": 0.3294880132589334, "kl": 0.156982421875, "learning_rate": 2.6627013549712355e-06, "loss": 0.1065, "reward": 2.037109375, "reward_std": 0.3392053321003914, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1965 }, { "clip_ratio": 0.0, "completion_length": 649.0234375, "epoch": 0.7864, "grad_norm": 0.5714690029466435, "kl": 0.145751953125, "learning_rate": 2.6532217200065856e-06, "loss": 0.1822, "reward": 1.986328125, "reward_std": 0.4990316182374954, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.931640625, "step": 1966 }, { "clip_ratio": 0.0, "completion_length": 699.6171875, "epoch": 0.7868, "grad_norm": 0.36440847466122717, "kl": 0.1427001953125, "learning_rate": 2.643756407962127e-06, "loss": 0.02, "reward": 2.169921875, "reward_std": 0.22315485030412674, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1967 }, { "clip_ratio": 0.0, "completion_length": 659.5390625, "epoch": 0.7872, "grad_norm": 0.2936534509654772, "kl": 0.15478515625, "learning_rate": 2.634305437290968e-06, "loss": 0.0838, "reward": 2.009765625, "reward_std": 0.35594895482063293, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.947265625, "step": 1968 }, { "clip_ratio": 0.0, "completion_length": 682.5, "epoch": 0.7876, "grad_norm": 0.3142124897773653, "kl": 0.1337890625, "learning_rate": 2.624868826418262e-06, "loss": 0.1604, "reward": 1.88671875, "reward_std": 0.4751351475715637, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.94140625, "step": 1969 }, { "clip_ratio": 0.0, "completion_length": 726.40625, "epoch": 0.788, "grad_norm": 0.28934577259578, "kl": 0.14306640625, "learning_rate": 2.615446593741161e-06, "loss": 0.1212, "reward": 1.9375, "reward_std": 0.4290183037519455, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.921875, "step": 1970 }, { "clip_ratio": 0.0, "completion_length": 764.4609375, "epoch": 0.7884, "grad_norm": 0.2408015629136546, "kl": 0.1474609375, "learning_rate": 2.6060387576287983e-06, "loss": 0.0726, "reward": 1.958984375, "reward_std": 0.4145585149526596, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.951171875, "step": 1971 }, { "clip_ratio": 0.0, "completion_length": 663.0625, "epoch": 0.7888, "grad_norm": 0.1631321254913252, "kl": 0.148681640625, "learning_rate": 2.596645336422219e-06, "loss": 0.0284, "reward": 2.119140625, "reward_std": 0.0859375, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 1972 }, { "clip_ratio": 0.0, "completion_length": 687.9609375, "epoch": 0.7892, "grad_norm": 0.3648732804860769, "kl": 0.189697265625, "learning_rate": 2.5872663484343887e-06, "loss": 0.0954, "reward": 1.88671875, "reward_std": 0.32748541235923767, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94921875, "step": 1973 }, { "clip_ratio": 0.0, "completion_length": 675.4765625, "epoch": 0.7896, "grad_norm": 0.5176674247175533, "kl": 0.16259765625, "learning_rate": 2.577901811950121e-06, "loss": 0.0787, "reward": 2.009765625, "reward_std": 0.40657516568899155, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.962890625, "step": 1974 }, { "clip_ratio": 0.0, "completion_length": 635.9375, "epoch": 0.79, "grad_norm": 407.4895458906689, "kl": 90.601318359375, "learning_rate": 2.5685517452260566e-06, "loss": 4.9085, "reward": 2.05078125, "reward_std": 0.330462783575058, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94921875, "step": 1975 }, { "clip_ratio": 0.0, "completion_length": 630.6875, "epoch": 0.7904, "grad_norm": 1.083163117391579, "kl": 0.15966796875, "learning_rate": 2.5592161664906366e-06, "loss": 0.0841, "reward": 2.00390625, "reward_std": 0.3218083903193474, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 1976 }, { "clip_ratio": 0.0, "completion_length": 701.3984375, "epoch": 0.7908, "grad_norm": 0.19622593865260554, "kl": 0.142578125, "learning_rate": 2.549895093944039e-06, "loss": 0.0568, "reward": 2.0078125, "reward_std": 0.25821826606988907, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1977 }, { "clip_ratio": 0.0, "completion_length": 663.203125, "epoch": 0.7912, "grad_norm": 0.6009704860519237, "kl": 0.16357421875, "learning_rate": 2.5405885457581793e-06, "loss": 0.1324, "reward": 2.177734375, "reward_std": 0.5136195942759514, "rewards/accuracy_reward": 0.3046875, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.951171875, "step": 1978 }, { "clip_ratio": 0.0, "completion_length": 729.7109375, "epoch": 0.7916, "grad_norm": 0.2335048971044847, "kl": 0.139404296875, "learning_rate": 2.5312965400766475e-06, "loss": 0.0325, "reward": 2.05859375, "reward_std": 0.16946755349636078, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 1979 }, { "clip_ratio": 0.0, "completion_length": 626.328125, "epoch": 0.792, "grad_norm": 1.149661343664387, "kl": 0.16552734375, "learning_rate": 2.522019095014683e-06, "loss": 0.0503, "reward": 2.087890625, "reward_std": 0.23652399331331253, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 1980 }, { "clip_ratio": 0.0, "completion_length": 685.2421875, "epoch": 0.7924, "grad_norm": 0.22169380257309193, "kl": 0.133544921875, "learning_rate": 2.512756228659141e-06, "loss": 0.0329, "reward": 1.990234375, "reward_std": 0.2457016110420227, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 1981 }, { "clip_ratio": 0.0, "completion_length": 727.484375, "epoch": 0.7928, "grad_norm": 0.27410797287601973, "kl": 0.1378173828125, "learning_rate": 2.5035079590684496e-06, "loss": 0.0566, "reward": 1.982421875, "reward_std": 0.3092750012874603, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 1982 }, { "clip_ratio": 0.0, "completion_length": 697.3359375, "epoch": 0.7932, "grad_norm": 0.4936875531295832, "kl": 0.159423828125, "learning_rate": 2.494274304272589e-06, "loss": 0.084, "reward": 1.95703125, "reward_std": 0.3495491147041321, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.96484375, "step": 1983 }, { "clip_ratio": 0.0, "completion_length": 726.8515625, "epoch": 0.7936, "grad_norm": 0.1979610329164493, "kl": 0.135498046875, "learning_rate": 2.48505528227304e-06, "loss": 0.0581, "reward": 1.970703125, "reward_std": 0.24106165021657944, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1984 }, { "clip_ratio": 0.0, "completion_length": 693.890625, "epoch": 0.794, "grad_norm": 0.2751802710848594, "kl": 0.135498046875, "learning_rate": 2.4758509110427576e-06, "loss": 0.0461, "reward": 2.078125, "reward_std": 0.3345356658101082, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 1985 }, { "clip_ratio": 0.0, "completion_length": 724.5234375, "epoch": 0.7944, "grad_norm": 1.603398725981801, "kl": 0.28466796875, "learning_rate": 2.4666612085261344e-06, "loss": 0.0834, "reward": 1.84375, "reward_std": 0.33599888533353806, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9296875, "step": 1986 }, { "clip_ratio": 0.0, "completion_length": 664.40625, "epoch": 0.7948, "grad_norm": 0.37788878855440533, "kl": 0.14306640625, "learning_rate": 2.4574861926389615e-06, "loss": 0.0675, "reward": 2.064453125, "reward_std": 0.38670212775468826, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 1987 }, { "clip_ratio": 0.0, "completion_length": 648.3125, "epoch": 0.7952, "grad_norm": 0.2927208667206982, "kl": 0.14208984375, "learning_rate": 2.4483258812684096e-06, "loss": 0.1116, "reward": 1.93359375, "reward_std": 0.36415470391511917, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.96484375, "step": 1988 }, { "clip_ratio": 0.0, "completion_length": 704.21875, "epoch": 0.7956, "grad_norm": 0.2402166247788019, "kl": 0.1256103515625, "learning_rate": 2.4391802922729703e-06, "loss": 0.0586, "reward": 1.998046875, "reward_std": 0.39049164205789566, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.958984375, "step": 1989 }, { "clip_ratio": 0.0, "completion_length": 627.4140625, "epoch": 0.796, "grad_norm": 0.4040683836238234, "kl": 0.166015625, "learning_rate": 2.4300494434824373e-06, "loss": 0.0719, "reward": 2.333984375, "reward_std": 0.3166820704936981, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.974609375, "step": 1990 }, { "clip_ratio": 0.0, "completion_length": 746.0078125, "epoch": 0.7964, "grad_norm": 0.29005760040512274, "kl": 0.16162109375, "learning_rate": 2.420933352697865e-06, "loss": 0.1236, "reward": 1.71484375, "reward_std": 0.4133491963148117, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.828125, "rewards/tag_count_reward": 0.87890625, "step": 1991 }, { "clip_ratio": 0.0, "completion_length": 640.4375, "epoch": 0.7968, "grad_norm": 0.280109484367419, "kl": 0.142822265625, "learning_rate": 2.411832037691545e-06, "loss": 0.1462, "reward": 2.16796875, "reward_std": 0.44468729943037033, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.95703125, "step": 1992 }, { "clip_ratio": 0.0, "completion_length": 722.5234375, "epoch": 0.7972, "grad_norm": 0.5594090416315308, "kl": 0.133544921875, "learning_rate": 2.4027455162069567e-06, "loss": 0.0604, "reward": 2.11328125, "reward_std": 0.2535141110420227, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 1993 }, { "clip_ratio": 0.0, "completion_length": 786.9765625, "epoch": 0.7976, "grad_norm": 0.18769009141352688, "kl": 0.1357421875, "learning_rate": 2.3936738059587284e-06, "loss": 0.0309, "reward": 2.068359375, "reward_std": 0.27799221873283386, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1994 }, { "clip_ratio": 0.0, "completion_length": 701.828125, "epoch": 0.798, "grad_norm": 0.4639811046390169, "kl": 0.151611328125, "learning_rate": 2.3846169246326345e-06, "loss": 0.0281, "reward": 1.943359375, "reward_std": 0.21318094432353973, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 1995 }, { "clip_ratio": 0.0, "completion_length": 683.4375, "epoch": 0.7984, "grad_norm": 0.29845234437492374, "kl": 0.156005859375, "learning_rate": 2.37557488988552e-06, "loss": 0.0629, "reward": 2.154296875, "reward_std": 0.2501547113060951, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.982421875, "step": 1996 }, { "clip_ratio": 0.0, "completion_length": 741.625, "epoch": 0.7988, "grad_norm": 0.22283955150054294, "kl": 0.125244140625, "learning_rate": 2.3665477193453037e-06, "loss": 0.0404, "reward": 2.16015625, "reward_std": 0.2629641965031624, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 1997 }, { "clip_ratio": 0.0, "completion_length": 717.5703125, "epoch": 0.7992, "grad_norm": 0.2631313920710909, "kl": 0.130859375, "learning_rate": 2.35753543061091e-06, "loss": 0.0682, "reward": 1.94140625, "reward_std": 0.2681770622730255, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.97265625, "step": 1998 }, { "clip_ratio": 0.0, "completion_length": 672.859375, "epoch": 0.7996, "grad_norm": 0.22606961173673587, "kl": 0.150634765625, "learning_rate": 2.3485380412522586e-06, "loss": 0.0943, "reward": 2.0703125, "reward_std": 0.21875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 1999 }, { "clip_ratio": 0.0, "completion_length": 728.2734375, "epoch": 0.8, "grad_norm": 0.41363115348832497, "kl": 0.12548828125, "learning_rate": 2.339555568810221e-06, "loss": 0.1082, "reward": 1.841796875, "reward_std": 0.4424043670296669, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.935546875, "step": 2000 }, { "clip_ratio": 0.0, "completion_length": 732.46875, "epoch": 0.8004, "grad_norm": 1.0278181230721157, "kl": 0.1258544921875, "learning_rate": 2.3305880307965834e-06, "loss": 0.0505, "reward": 1.9296875, "reward_std": 0.23917050659656525, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 2001 }, { "clip_ratio": 0.0, "completion_length": 707.1875, "epoch": 0.8008, "grad_norm": 0.33942433445291625, "kl": 0.13427734375, "learning_rate": 2.321635444694028e-06, "loss": 0.0514, "reward": 2.046875, "reward_std": 0.3418140709400177, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.9609375, "step": 2002 }, { "clip_ratio": 0.0, "completion_length": 716.0703125, "epoch": 0.8012, "grad_norm": 0.31197334592110154, "kl": 0.148193359375, "learning_rate": 2.3126978279560687e-06, "loss": 0.0803, "reward": 2.0390625, "reward_std": 0.34375, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 2003 }, { "clip_ratio": 0.0, "completion_length": 585.421875, "epoch": 0.8016, "grad_norm": 0.34024167039857134, "kl": 0.1484375, "learning_rate": 2.3037751980070557e-06, "loss": 0.0685, "reward": 2.11328125, "reward_std": 0.3963082879781723, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 2004 }, { "clip_ratio": 0.0, "completion_length": 675.4296875, "epoch": 0.802, "grad_norm": 0.17186353437274, "kl": 0.129638671875, "learning_rate": 2.2948675722421086e-06, "loss": 0.0277, "reward": 2.09765625, "reward_std": 0.109375, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2005 }, { "clip_ratio": 0.0, "completion_length": 722.5703125, "epoch": 0.8024, "grad_norm": 0.23474436603332496, "kl": 0.127685546875, "learning_rate": 2.2859749680270983e-06, "loss": 0.0926, "reward": 1.884765625, "reward_std": 0.31509189307689667, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.939453125, "step": 2006 }, { "clip_ratio": 0.0, "completion_length": 660.34375, "epoch": 0.8028, "grad_norm": 0.15267888639031663, "kl": 0.1356201171875, "learning_rate": 2.277097402698619e-06, "loss": 0.0466, "reward": 1.974609375, "reward_std": 0.20675812661647797, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2007 }, { "clip_ratio": 0.0, "completion_length": 676.796875, "epoch": 0.8032, "grad_norm": 0.3588921592753144, "kl": 0.12646484375, "learning_rate": 2.2682348935639274e-06, "loss": 0.0233, "reward": 2.05859375, "reward_std": 0.27259334921836853, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2008 }, { "clip_ratio": 0.0, "completion_length": 672.7109375, "epoch": 0.8036, "grad_norm": 0.11615150129981996, "kl": 0.141845703125, "learning_rate": 2.259387457900948e-06, "loss": 0.0378, "reward": 1.94921875, "reward_std": 0.13555096089839935, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 2009 }, { "clip_ratio": 0.0, "completion_length": 680.140625, "epoch": 0.804, "grad_norm": 0.25535797695058143, "kl": 0.1336669921875, "learning_rate": 2.2505551129582047e-06, "loss": 0.0789, "reward": 2.20703125, "reward_std": 0.31886589527130127, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 2010 }, { "clip_ratio": 0.0, "completion_length": 705.03125, "epoch": 0.8044, "grad_norm": 0.5426994279601413, "kl": 0.1236572265625, "learning_rate": 2.241737875954808e-06, "loss": 0.0952, "reward": 1.97265625, "reward_std": 0.39581964164972305, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 2011 }, { "clip_ratio": 0.0, "completion_length": 733.6875, "epoch": 0.8048, "grad_norm": 0.16115601161825813, "kl": 0.128662109375, "learning_rate": 2.2329357640804118e-06, "loss": 0.0634, "reward": 1.83203125, "reward_std": 0.27351538836956024, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.91796875, "step": 2012 }, { "clip_ratio": 0.0, "completion_length": 699.8046875, "epoch": 0.8052, "grad_norm": 0.46687266643026676, "kl": 0.144775390625, "learning_rate": 2.22414879449518e-06, "loss": 0.1185, "reward": 1.927734375, "reward_std": 0.25518687814474106, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 2013 }, { "clip_ratio": 0.0, "completion_length": 692.59375, "epoch": 0.8056, "grad_norm": 0.4645340459158023, "kl": 0.149658203125, "learning_rate": 2.215376984329767e-06, "loss": 0.0553, "reward": 2.02734375, "reward_std": 0.3768854960799217, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 2014 }, { "clip_ratio": 0.0, "completion_length": 713.7890625, "epoch": 0.806, "grad_norm": 0.2802243086724675, "kl": 0.1328125, "learning_rate": 2.206620350685257e-06, "loss": 0.0509, "reward": 2.060546875, "reward_std": 0.3889497220516205, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 2015 }, { "clip_ratio": 0.0, "completion_length": 679.2109375, "epoch": 0.8064, "grad_norm": 0.22443155686301416, "kl": 0.1484375, "learning_rate": 2.1978789106331666e-06, "loss": 0.097, "reward": 2.080078125, "reward_std": 0.3973563238978386, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.947265625, "step": 2016 }, { "clip_ratio": 0.0, "completion_length": 680.765625, "epoch": 0.8068, "grad_norm": 0.2899735794061666, "kl": 0.147216796875, "learning_rate": 2.1891526812153674e-06, "loss": 0.0534, "reward": 2.064453125, "reward_std": 0.3651297837495804, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.947265625, "step": 2017 }, { "clip_ratio": 0.0, "completion_length": 654.4296875, "epoch": 0.8072, "grad_norm": 0.3862798332514061, "kl": 0.143798828125, "learning_rate": 2.1804416794441e-06, "loss": 0.0535, "reward": 2.1640625, "reward_std": 0.31217923015356064, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 2018 }, { "clip_ratio": 0.0, "completion_length": 585.703125, "epoch": 0.8076, "grad_norm": 0.38240190547013453, "kl": 0.13720703125, "learning_rate": 2.171745922301903e-06, "loss": 0.0866, "reward": 2.18359375, "reward_std": 0.4470468610525131, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 2019 }, { "clip_ratio": 0.0, "completion_length": 648.7109375, "epoch": 0.808, "grad_norm": 0.2386892771261777, "kl": 0.1275634765625, "learning_rate": 2.163065426741603e-06, "loss": 0.0521, "reward": 1.9609375, "reward_std": 0.20199526846408844, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2020 }, { "clip_ratio": 0.0, "completion_length": 705.234375, "epoch": 0.8084, "grad_norm": 0.22582723866132737, "kl": 0.1302490234375, "learning_rate": 2.154400209686268e-06, "loss": 0.0238, "reward": 2.04296875, "reward_std": 0.16831252723932266, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98828125, "step": 2021 }, { "clip_ratio": 0.0, "completion_length": 733.8125, "epoch": 0.8088, "grad_norm": 0.21924160278619173, "kl": 0.124755859375, "learning_rate": 2.1457502880291815e-06, "loss": 0.0871, "reward": 1.9609375, "reward_std": 0.38823191821575165, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.953125, "step": 2022 }, { "clip_ratio": 0.0, "completion_length": 680.359375, "epoch": 0.8092, "grad_norm": 0.4027434913218567, "kl": 0.1478271484375, "learning_rate": 2.1371156786338108e-06, "loss": 0.0248, "reward": 2.068359375, "reward_std": 0.2742358520627022, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 2023 }, { "clip_ratio": 0.0, "completion_length": 644.3125, "epoch": 0.8096, "grad_norm": 0.2728802473762859, "kl": 0.1328125, "learning_rate": 2.128496398333768e-06, "loss": 0.0352, "reward": 1.974609375, "reward_std": 0.20675812661647797, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2024 }, { "clip_ratio": 0.0, "completion_length": 661.3828125, "epoch": 0.81, "grad_norm": 0.290517657468777, "kl": 0.144287109375, "learning_rate": 2.119892463932781e-06, "loss": 0.0081, "reward": 2.017578125, "reward_std": 0.15988312661647797, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 2025 }, { "clip_ratio": 0.0, "completion_length": 705.703125, "epoch": 0.8104, "grad_norm": 0.2528444231984569, "kl": 0.135986328125, "learning_rate": 2.1113038922046603e-06, "loss": 0.0743, "reward": 1.8828125, "reward_std": 0.31698688864707947, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.953125, "step": 2026 }, { "clip_ratio": 0.0, "completion_length": 737.2109375, "epoch": 0.8108, "grad_norm": 0.48190885359074664, "kl": 0.1396484375, "learning_rate": 2.102730699893263e-06, "loss": 0.0603, "reward": 1.9609375, "reward_std": 0.36879370361566544, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.9609375, "step": 2027 }, { "clip_ratio": 0.0, "completion_length": 652.3046875, "epoch": 0.8112, "grad_norm": 0.26951266838781174, "kl": 0.1181640625, "learning_rate": 2.09417290371247e-06, "loss": 0.0481, "reward": 2.193359375, "reward_std": 0.2819499894976616, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2028 }, { "clip_ratio": 0.0, "completion_length": 662.5, "epoch": 0.8116, "grad_norm": 0.7867591347214307, "kl": 0.135498046875, "learning_rate": 2.0856305203461436e-06, "loss": 0.096, "reward": 2.25, "reward_std": 0.2929438352584839, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 2029 }, { "clip_ratio": 0.0, "completion_length": 681.0390625, "epoch": 0.812, "grad_norm": 0.35441920990565096, "kl": 0.123291015625, "learning_rate": 2.0771035664480944e-06, "loss": 0.1287, "reward": 1.9140625, "reward_std": 0.37114593386650085, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.9453125, "step": 2030 }, { "clip_ratio": 0.0, "completion_length": 674.4765625, "epoch": 0.8124, "grad_norm": 0.31053118155304704, "kl": 0.150634765625, "learning_rate": 2.0685920586420562e-06, "loss": 0.0564, "reward": 2.12109375, "reward_std": 0.3299161493778229, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 2031 }, { "clip_ratio": 0.0, "completion_length": 718.84375, "epoch": 0.8128, "grad_norm": 0.29195920077021115, "kl": 0.135498046875, "learning_rate": 2.0600960135216463e-06, "loss": 0.0872, "reward": 1.796875, "reward_std": 0.37189067900180817, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.84375, "rewards/tag_count_reward": 0.8828125, "step": 2032 }, { "clip_ratio": 0.0, "completion_length": 670.625, "epoch": 0.8132, "grad_norm": 0.251596540913049, "kl": 0.12744140625, "learning_rate": 2.051615447650347e-06, "loss": 0.0351, "reward": 2.115234375, "reward_std": 0.2987433969974518, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2033 }, { "clip_ratio": 0.0, "completion_length": 645.796875, "epoch": 0.8136, "grad_norm": 0.2540126097298561, "kl": 0.13818359375, "learning_rate": 2.0431503775614457e-06, "loss": 0.0993, "reward": 2.181640625, "reward_std": 0.23877985030412674, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2034 }, { "clip_ratio": 0.0, "completion_length": 722.359375, "epoch": 0.814, "grad_norm": 0.2939366439706583, "kl": 0.1259765625, "learning_rate": 2.0347008197580376e-06, "loss": 0.0574, "reward": 1.98828125, "reward_std": 0.32471735030412674, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.97265625, "step": 2035 }, { "clip_ratio": 0.0, "completion_length": 649.5078125, "epoch": 0.8144, "grad_norm": 0.14385890295554948, "kl": 0.1177978515625, "learning_rate": 2.026266790712965e-06, "loss": 0.0613, "reward": 2.083984375, "reward_std": 0.12940484285354614, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2036 }, { "clip_ratio": 0.0, "completion_length": 678.0, "epoch": 0.8148, "grad_norm": 0.177198273881463, "kl": 0.12841796875, "learning_rate": 2.017848306868797e-06, "loss": 0.05, "reward": 1.974609375, "reward_std": 0.1353645622730255, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 2037 }, { "clip_ratio": 0.0, "completion_length": 643.2734375, "epoch": 0.8152, "grad_norm": 0.45145805306358666, "kl": 0.1409912109375, "learning_rate": 2.009445384637805e-06, "loss": 0.0414, "reward": 2.205078125, "reward_std": 0.3821316659450531, "rewards/accuracy_reward": 0.2734375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2038 }, { "clip_ratio": 0.0, "completion_length": 697.8359375, "epoch": 0.8156, "grad_norm": 0.2554687173677781, "kl": 0.145263671875, "learning_rate": 2.0010580404019066e-06, "loss": 0.0348, "reward": 1.966796875, "reward_std": 0.1953125, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2039 }, { "clip_ratio": 0.0, "completion_length": 739.3359375, "epoch": 0.816, "grad_norm": 0.2451746479499207, "kl": 0.1383056640625, "learning_rate": 1.9926862905126663e-06, "loss": 0.0346, "reward": 1.970703125, "reward_std": 0.3078790307044983, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 2040 }, { "clip_ratio": 0.0, "completion_length": 710.6484375, "epoch": 0.8164, "grad_norm": 0.3897897091372338, "kl": 0.1431884765625, "learning_rate": 1.984330151291233e-06, "loss": 0.097, "reward": 1.966796875, "reward_std": 0.42052000761032104, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.943359375, "step": 2041 }, { "clip_ratio": 0.0, "completion_length": 670.7265625, "epoch": 0.8168, "grad_norm": 0.255668706888063, "kl": 0.1357421875, "learning_rate": 1.9759896390283362e-06, "loss": 0.0445, "reward": 1.974609375, "reward_std": 0.16969294100999832, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2042 }, { "clip_ratio": 0.0, "completion_length": 695.625, "epoch": 0.8172, "grad_norm": 0.23916368770563481, "kl": 0.1484375, "learning_rate": 1.9676647699842246e-06, "loss": 0.0555, "reward": 2.125, "reward_std": 0.23063847422599792, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 2043 }, { "clip_ratio": 0.0, "completion_length": 713.796875, "epoch": 0.8176, "grad_norm": 0.5057246524854465, "kl": 0.13525390625, "learning_rate": 1.959355560388654e-06, "loss": 0.1135, "reward": 1.923828125, "reward_std": 0.3893493711948395, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.916015625, "step": 2044 }, { "clip_ratio": 0.0, "completion_length": 696.359375, "epoch": 0.818, "grad_norm": 0.28633757340130633, "kl": 0.12939453125, "learning_rate": 1.95106202644086e-06, "loss": 0.0629, "reward": 2.26171875, "reward_std": 0.3133533075451851, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 2045 }, { "clip_ratio": 0.0, "completion_length": 716.421875, "epoch": 0.8184, "grad_norm": 0.20112006998091686, "kl": 0.1348876953125, "learning_rate": 1.9427841843095063e-06, "loss": 0.0628, "reward": 1.947265625, "reward_std": 0.22060275077819824, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2046 }, { "clip_ratio": 0.0, "completion_length": 661.40625, "epoch": 0.8188, "grad_norm": 0.2846255025924229, "kl": 0.144775390625, "learning_rate": 1.934522050132678e-06, "loss": 0.0541, "reward": 1.986328125, "reward_std": 0.30408646166324615, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 2047 }, { "clip_ratio": 0.0, "completion_length": 715.0390625, "epoch": 0.8192, "grad_norm": 0.1235107415708227, "kl": 0.132080078125, "learning_rate": 1.9262756400178163e-06, "loss": 0.0153, "reward": 2.236328125, "reward_std": 0.0546875, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 2048 }, { "clip_ratio": 0.0, "completion_length": 701.28125, "epoch": 0.8196, "grad_norm": 0.8813235390216972, "kl": 0.164794921875, "learning_rate": 1.918044970041729e-06, "loss": 0.0553, "reward": 1.970703125, "reward_std": 0.40224993973970413, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.947265625, "step": 2049 }, { "clip_ratio": 0.0, "completion_length": 759.4296875, "epoch": 0.82, "grad_norm": 0.8590034131889572, "kl": 0.1239013671875, "learning_rate": 1.9098300562505266e-06, "loss": 0.0416, "reward": 1.84375, "reward_std": 0.32351601123809814, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9375, "step": 2050 }, { "clip_ratio": 0.0, "completion_length": 793.78125, "epoch": 0.8204, "grad_norm": 0.24887566531050176, "kl": 0.116943359375, "learning_rate": 1.9016309146596024e-06, "loss": 0.0531, "reward": 1.958984375, "reward_std": 0.3082016110420227, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 2051 }, { "clip_ratio": 0.0, "completion_length": 679.1640625, "epoch": 0.8208, "grad_norm": 0.2995839533905384, "kl": 0.1259765625, "learning_rate": 1.8934475612536019e-06, "loss": 0.1, "reward": 1.953125, "reward_std": 0.3717610538005829, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.96875, "step": 2052 }, { "clip_ratio": 0.0, "completion_length": 725.875, "epoch": 0.8212, "grad_norm": 0.18251932073870858, "kl": 0.134033203125, "learning_rate": 1.8852800119863912e-06, "loss": 0.0558, "reward": 1.955078125, "reward_std": 0.22554133832454681, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2053 }, { "clip_ratio": 0.0, "completion_length": 632.78125, "epoch": 0.8216, "grad_norm": 1.3836490688143448, "kl": 0.191650390625, "learning_rate": 1.8771282827810278e-06, "loss": 0.0443, "reward": 2.203125, "reward_std": 0.19313044100999832, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2054 }, { "clip_ratio": 0.0, "completion_length": 770.21875, "epoch": 0.822, "grad_norm": 0.27670194147709437, "kl": 0.1259765625, "learning_rate": 1.8689923895297247e-06, "loss": 0.0502, "reward": 1.947265625, "reward_std": 0.47308940440416336, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.939453125, "step": 2055 }, { "clip_ratio": 0.0, "completion_length": 739.21875, "epoch": 0.8224, "grad_norm": 0.21854466953867524, "kl": 0.1214599609375, "learning_rate": 1.8608723480938207e-06, "loss": 0.0804, "reward": 1.984375, "reward_std": 0.3765261843800545, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.9609375, "step": 2056 }, { "clip_ratio": 0.0, "completion_length": 766.265625, "epoch": 0.8228, "grad_norm": 0.19969317154640606, "kl": 0.138916015625, "learning_rate": 1.8527681743037518e-06, "loss": 0.0262, "reward": 2.083984375, "reward_std": 0.17844055593013763, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.982421875, "step": 2057 }, { "clip_ratio": 0.0, "completion_length": 747.5234375, "epoch": 0.8232, "grad_norm": 1.537428720304166, "kl": 0.18505859375, "learning_rate": 1.8446798839590186e-06, "loss": 0.1255, "reward": 1.837890625, "reward_std": 0.5645126104354858, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.916015625, "step": 2058 }, { "clip_ratio": 0.0, "completion_length": 635.03125, "epoch": 0.8236, "grad_norm": 0.1734538269288526, "kl": 0.13525390625, "learning_rate": 1.8366074928281608e-06, "loss": 0.0589, "reward": 1.998046875, "reward_std": 0.19718601554632187, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2059 }, { "clip_ratio": 0.0, "completion_length": 727.3203125, "epoch": 0.824, "grad_norm": 0.23708246259964474, "kl": 0.126708984375, "learning_rate": 1.8285510166487154e-06, "loss": 0.0502, "reward": 2.02734375, "reward_std": 0.33660992980003357, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 2060 }, { "clip_ratio": 0.0, "completion_length": 714.7109375, "epoch": 0.8244, "grad_norm": 0.360542788310325, "kl": 0.140869140625, "learning_rate": 1.820510471127196e-06, "loss": 0.0325, "reward": 2.14453125, "reward_std": 0.17374851554632187, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2061 }, { "clip_ratio": 0.0, "completion_length": 754.71875, "epoch": 0.8248, "grad_norm": 0.32136487580847456, "kl": 0.14013671875, "learning_rate": 1.812485871939056e-06, "loss": 0.0821, "reward": 2.02734375, "reward_std": 0.4191049635410309, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 2062 }, { "clip_ratio": 0.0, "completion_length": 759.6484375, "epoch": 0.8252, "grad_norm": 0.18918534133765796, "kl": 0.12646484375, "learning_rate": 1.804477234728661e-06, "loss": 0.0546, "reward": 2.205078125, "reward_std": 0.22543276846408844, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 2063 }, { "clip_ratio": 0.0, "completion_length": 757.78125, "epoch": 0.8256, "grad_norm": 1.6047227333486105, "kl": 0.170166015625, "learning_rate": 1.7964845751092663e-06, "loss": 0.0866, "reward": 1.994140625, "reward_std": 0.38667865842580795, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.939453125, "step": 2064 }, { "clip_ratio": 0.0, "completion_length": 623.4375, "epoch": 0.826, "grad_norm": 0.3176336761494283, "kl": 0.150146484375, "learning_rate": 1.7885079086629598e-06, "loss": 0.2003, "reward": 1.98046875, "reward_std": 0.41543281078338623, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.94921875, "step": 2065 }, { "clip_ratio": 0.0, "completion_length": 679.421875, "epoch": 0.8264, "grad_norm": 0.22942667626518531, "kl": 0.137451171875, "learning_rate": 1.7805472509406695e-06, "loss": 0.0493, "reward": 1.927734375, "reward_std": 0.22148846089839935, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.974609375, "step": 2066 }, { "clip_ratio": 0.0, "completion_length": 723.3203125, "epoch": 0.8268, "grad_norm": 0.2834123470352102, "kl": 0.13134765625, "learning_rate": 1.7726026174621004e-06, "loss": 0.0468, "reward": 2.14453125, "reward_std": 0.3111075162887573, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 2067 }, { "clip_ratio": 0.0, "completion_length": 624.65625, "epoch": 0.8272, "grad_norm": 0.7032625093483174, "kl": 0.140625, "learning_rate": 1.7646740237157256e-06, "loss": 0.034, "reward": 2.228515625, "reward_std": 0.3811454772949219, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2068 }, { "clip_ratio": 0.0, "completion_length": 655.328125, "epoch": 0.8276, "grad_norm": 0.18999207941799348, "kl": 0.1297607421875, "learning_rate": 1.7567614851587444e-06, "loss": 0.1246, "reward": 2.0625, "reward_std": 0.3264976888895035, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 2069 }, { "clip_ratio": 0.0, "completion_length": 681.40625, "epoch": 0.828, "grad_norm": 0.29371967505422103, "kl": 0.12646484375, "learning_rate": 1.7488650172170496e-06, "loss": 0.0882, "reward": 2.00390625, "reward_std": 0.4095037654042244, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 2070 }, { "clip_ratio": 0.0, "completion_length": 702.0, "epoch": 0.8284, "grad_norm": 0.21174647552446837, "kl": 0.1322021484375, "learning_rate": 1.7409846352852144e-06, "loss": 0.0746, "reward": 2.10546875, "reward_std": 0.3335443213582039, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 2071 }, { "clip_ratio": 0.0, "completion_length": 725.21875, "epoch": 0.8288, "grad_norm": 0.22442312187615596, "kl": 0.13134765625, "learning_rate": 1.7331203547264452e-06, "loss": 0.0356, "reward": 2.087890625, "reward_std": 0.27919505536556244, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2072 }, { "clip_ratio": 0.0, "completion_length": 750.3828125, "epoch": 0.8292, "grad_norm": 2.1191640800194342, "kl": 0.146484375, "learning_rate": 1.7252721908725633e-06, "loss": 0.0662, "reward": 2.076171875, "reward_std": 0.30752456933259964, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.927734375, "step": 2073 }, { "clip_ratio": 0.0, "completion_length": 631.640625, "epoch": 0.8296, "grad_norm": 0.6434594058727543, "kl": 0.133544921875, "learning_rate": 1.7174401590239587e-06, "loss": 0.0846, "reward": 2.09765625, "reward_std": 0.4078468158841133, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94140625, "step": 2074 }, { "clip_ratio": 0.0, "completion_length": 667.453125, "epoch": 0.83, "grad_norm": 0.41502911097143297, "kl": 0.1396484375, "learning_rate": 1.709624274449584e-06, "loss": 0.0676, "reward": 2.216796875, "reward_std": 0.3280906081199646, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 2075 }, { "clip_ratio": 0.0, "completion_length": 665.828125, "epoch": 0.8304, "grad_norm": 0.47191344039291133, "kl": 0.164306640625, "learning_rate": 1.7018245523869038e-06, "loss": 0.0273, "reward": 2.041015625, "reward_std": 0.25363312661647797, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.986328125, "step": 2076 }, { "clip_ratio": 0.0, "completion_length": 642.765625, "epoch": 0.8308, "grad_norm": 0.30763341992458215, "kl": 0.1424560546875, "learning_rate": 1.6940410080418723e-06, "loss": 0.0737, "reward": 2.181640625, "reward_std": 0.33884888887405396, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2077 }, { "clip_ratio": 0.0, "completion_length": 708.2890625, "epoch": 0.8312, "grad_norm": 0.25391987449166553, "kl": 0.13818359375, "learning_rate": 1.686273656588917e-06, "loss": 0.061, "reward": 1.962890625, "reward_std": 0.3678753525018692, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 2078 }, { "clip_ratio": 0.0, "completion_length": 723.0390625, "epoch": 0.8316, "grad_norm": 0.13205951638677385, "kl": 0.1253662109375, "learning_rate": 1.6785225131708749e-06, "loss": 0.0271, "reward": 1.9453125, "reward_std": 0.14286844432353973, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2079 }, { "clip_ratio": 0.0, "completion_length": 701.5703125, "epoch": 0.832, "grad_norm": 0.6184617087783351, "kl": 0.1529541015625, "learning_rate": 1.6707875928990059e-06, "loss": 0.106, "reward": 1.875, "reward_std": 0.33691153675317764, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.9453125, "step": 2080 }, { "clip_ratio": 0.0, "completion_length": 703.296875, "epoch": 0.8324, "grad_norm": 0.2003200170800986, "kl": 0.12744140625, "learning_rate": 1.6630689108529286e-06, "loss": 0.1037, "reward": 2.1640625, "reward_std": 0.4258599281311035, "rewards/accuracy_reward": 0.2890625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.9453125, "step": 2081 }, { "clip_ratio": 0.0, "completion_length": 754.4609375, "epoch": 0.8328, "grad_norm": 1.0211462565173748, "kl": 0.13720703125, "learning_rate": 1.6553664820806102e-06, "loss": 0.0452, "reward": 1.943359375, "reward_std": 0.25440485030412674, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 2082 }, { "clip_ratio": 0.0, "completion_length": 712.953125, "epoch": 0.8332, "grad_norm": 0.26902471938876116, "kl": 0.12109375, "learning_rate": 1.6476803215983295e-06, "loss": 0.1185, "reward": 2.0, "reward_std": 0.5130334869027138, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.9296875, "step": 2083 }, { "clip_ratio": 0.0, "completion_length": 668.2578125, "epoch": 0.8336, "grad_norm": 0.31540633541743884, "kl": 0.125, "learning_rate": 1.6400104443906463e-06, "loss": 0.0817, "reward": 2.17578125, "reward_std": 0.3871631771326065, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.94140625, "step": 2084 }, { "clip_ratio": 0.0, "completion_length": 766.2109375, "epoch": 0.834, "grad_norm": 0.2706360164586868, "kl": 0.1357421875, "learning_rate": 1.6323568654103838e-06, "loss": 0.0469, "reward": 1.9765625, "reward_std": 0.35419902205467224, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 2085 }, { "clip_ratio": 0.0, "completion_length": 703.1484375, "epoch": 0.8344, "grad_norm": 1.0947605426266878, "kl": 0.1497802734375, "learning_rate": 1.6247195995785836e-06, "loss": 0.0989, "reward": 1.927734375, "reward_std": 0.4391675144433975, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.935546875, "step": 2086 }, { "clip_ratio": 0.0, "completion_length": 718.484375, "epoch": 0.8348, "grad_norm": 0.18689031890502572, "kl": 0.133056640625, "learning_rate": 1.6170986617844864e-06, "loss": 0.0527, "reward": 2.056640625, "reward_std": 0.23877984285354614, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2087 }, { "clip_ratio": 0.0, "completion_length": 733.8828125, "epoch": 0.8352, "grad_norm": 0.2658837711693862, "kl": 0.13525390625, "learning_rate": 1.6094940668855008e-06, "loss": 0.0946, "reward": 2.068359375, "reward_std": 0.3858395293354988, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.958984375, "step": 2088 }, { "clip_ratio": 0.0, "completion_length": 667.625, "epoch": 0.8356, "grad_norm": 0.11985176124458018, "kl": 0.129638671875, "learning_rate": 1.601905829707171e-06, "loss": 0.0716, "reward": 2.169921875, "reward_std": 0.1737523227930069, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2089 }, { "clip_ratio": 0.0, "completion_length": 698.34375, "epoch": 0.836, "grad_norm": 0.2809720825573132, "kl": 0.1239013671875, "learning_rate": 1.5943339650431578e-06, "loss": 0.1057, "reward": 1.94921875, "reward_std": 0.410692036151886, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 2090 }, { "clip_ratio": 0.0, "completion_length": 703.3046875, "epoch": 0.8364, "grad_norm": 0.22454334246929028, "kl": 0.1494140625, "learning_rate": 1.5867784876551973e-06, "loss": 0.0813, "reward": 1.865234375, "reward_std": 0.39838702231645584, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.935546875, "step": 2091 }, { "clip_ratio": 0.0, "completion_length": 779.640625, "epoch": 0.8368, "grad_norm": 0.27088917465003, "kl": 0.153564453125, "learning_rate": 1.579239412273078e-06, "loss": 0.082, "reward": 1.96484375, "reward_std": 0.36732660233974457, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.93359375, "step": 2092 }, { "clip_ratio": 0.0, "completion_length": 666.359375, "epoch": 0.8372, "grad_norm": 0.32030391831106225, "kl": 0.13330078125, "learning_rate": 1.5717167535946142e-06, "loss": 0.0772, "reward": 1.951171875, "reward_std": 0.3426539748907089, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 2093 }, { "clip_ratio": 0.0, "completion_length": 756.203125, "epoch": 0.8376, "grad_norm": 0.24243703126581775, "kl": 0.136962890625, "learning_rate": 1.5642105262856122e-06, "loss": 0.0978, "reward": 1.873046875, "reward_std": 0.49551019072532654, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.927734375, "step": 2094 }, { "clip_ratio": 0.0, "completion_length": 723.3359375, "epoch": 0.838, "grad_norm": 0.2725546864281954, "kl": 0.117919921875, "learning_rate": 1.5567207449798517e-06, "loss": 0.0384, "reward": 1.955078125, "reward_std": 0.367541067302227, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.962890625, "step": 2095 }, { "clip_ratio": 0.0, "completion_length": 720.4140625, "epoch": 0.8384, "grad_norm": 0.6254485957310828, "kl": 0.138916015625, "learning_rate": 1.5492474242790368e-06, "loss": 0.1043, "reward": 1.95703125, "reward_std": 0.4403374269604683, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.93359375, "step": 2096 }, { "clip_ratio": 0.0, "completion_length": 762.453125, "epoch": 0.8388, "grad_norm": 0.18911434837106325, "kl": 0.1204833984375, "learning_rate": 1.5417905787527943e-06, "loss": 0.0703, "reward": 1.919921875, "reward_std": 0.34604495763778687, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.966796875, "step": 2097 }, { "clip_ratio": 0.0, "completion_length": 797.8203125, "epoch": 0.8392, "grad_norm": 0.25685388189088026, "kl": 0.133056640625, "learning_rate": 1.5343502229386209e-06, "loss": 0.0773, "reward": 1.92578125, "reward_std": 0.42423198372125626, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94921875, "step": 2098 }, { "clip_ratio": 0.0, "completion_length": 736.5390625, "epoch": 0.8396, "grad_norm": 0.2693836585961689, "kl": 0.148681640625, "learning_rate": 1.526926371341878e-06, "loss": 0.0623, "reward": 2.015625, "reward_std": 0.331501767039299, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.9453125, "step": 2099 }, { "clip_ratio": 0.0, "completion_length": 697.875, "epoch": 0.84, "grad_norm": 0.3194272289382016, "kl": 0.129638671875, "learning_rate": 1.5195190384357405e-06, "loss": 0.0628, "reward": 2.150390625, "reward_std": 0.2959893196821213, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2100 }, { "clip_ratio": 0.0, "completion_length": 683.5546875, "epoch": 0.8404, "grad_norm": 0.17951912302826986, "kl": 0.1318359375, "learning_rate": 1.5121282386611823e-06, "loss": 0.0215, "reward": 1.974609375, "reward_std": 0.1308765709400177, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2101 }, { "clip_ratio": 0.0, "completion_length": 701.546875, "epoch": 0.8408, "grad_norm": 0.6302504238465177, "kl": 0.121826171875, "learning_rate": 1.5047539864269477e-06, "loss": 0.0782, "reward": 1.921875, "reward_std": 0.27784235030412674, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 2102 }, { "clip_ratio": 0.0, "completion_length": 737.0546875, "epoch": 0.8412, "grad_norm": 0.19147410710164764, "kl": 0.1322021484375, "learning_rate": 1.4973962961095135e-06, "loss": 0.0609, "reward": 1.986328125, "reward_std": 0.34693287312984467, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.947265625, "step": 2103 }, { "clip_ratio": 0.0, "completion_length": 675.171875, "epoch": 0.8416, "grad_norm": 0.42586099820648593, "kl": 0.1494140625, "learning_rate": 1.490055182053083e-06, "loss": 0.0664, "reward": 2.099609375, "reward_std": 0.45340484380722046, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 2104 }, { "clip_ratio": 0.0, "completion_length": 731.9921875, "epoch": 0.842, "grad_norm": 0.29059067810286104, "kl": 0.131591796875, "learning_rate": 1.4827306585695234e-06, "loss": 0.0697, "reward": 1.779296875, "reward_std": 0.3029678240418434, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.904296875, "step": 2105 }, { "clip_ratio": 0.0, "completion_length": 696.046875, "epoch": 0.8424, "grad_norm": 0.8669042881625517, "kl": 0.149658203125, "learning_rate": 1.4754227399383758e-06, "loss": 0.0359, "reward": 2.33984375, "reward_std": 0.2401670292019844, "rewards/accuracy_reward": 0.3671875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2106 }, { "clip_ratio": 0.0, "completion_length": 666.71875, "epoch": 0.8428, "grad_norm": 0.5472332204667342, "kl": 0.145263671875, "learning_rate": 1.468131440406798e-06, "loss": 0.1654, "reward": 1.873046875, "reward_std": 0.43931056559085846, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.935546875, "step": 2107 }, { "clip_ratio": 0.0, "completion_length": 714.8828125, "epoch": 0.8432, "grad_norm": 0.19737913796847295, "kl": 0.12744140625, "learning_rate": 1.4608567741895496e-06, "loss": 0.0376, "reward": 2.033203125, "reward_std": 0.2230696976184845, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 2108 }, { "clip_ratio": 0.0, "completion_length": 757.1796875, "epoch": 0.8436, "grad_norm": 0.4067389237750651, "kl": 0.125, "learning_rate": 1.4535987554689712e-06, "loss": 0.0616, "reward": 1.91796875, "reward_std": 0.35184716433286667, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 2109 }, { "clip_ratio": 0.0, "completion_length": 685.171875, "epoch": 0.844, "grad_norm": 0.2846015694747191, "kl": 0.134521484375, "learning_rate": 1.446357398394934e-06, "loss": 0.0844, "reward": 1.96484375, "reward_std": 0.38910188525915146, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94921875, "step": 2110 }, { "clip_ratio": 0.0, "completion_length": 659.4296875, "epoch": 0.8444, "grad_norm": 0.19361228115922327, "kl": 0.1361083984375, "learning_rate": 1.439132717084839e-06, "loss": 0.0699, "reward": 1.89453125, "reward_std": 0.3113357946276665, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 2111 }, { "clip_ratio": 0.0, "completion_length": 746.1171875, "epoch": 0.8448, "grad_norm": 0.24205889729358496, "kl": 0.1256103515625, "learning_rate": 1.4319247256235713e-06, "loss": 0.0757, "reward": 2.025390625, "reward_std": 0.3697395622730255, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 2112 }, { "clip_ratio": 0.0, "completion_length": 759.875, "epoch": 0.8452, "grad_norm": 0.25744268154474165, "kl": 0.1326904296875, "learning_rate": 1.4247334380634792e-06, "loss": 0.0651, "reward": 1.716796875, "reward_std": 0.3484623655676842, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.7578125, "rewards/tag_count_reward": 0.818359375, "step": 2113 }, { "clip_ratio": 0.0, "completion_length": 661.703125, "epoch": 0.8456, "grad_norm": 0.3147216543532428, "kl": 0.119873046875, "learning_rate": 1.4175588684243447e-06, "loss": 0.0676, "reward": 2.025390625, "reward_std": 0.41916733980178833, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.962890625, "step": 2114 }, { "clip_ratio": 0.0, "completion_length": 609.609375, "epoch": 0.846, "grad_norm": 0.362652785971351, "kl": 0.13427734375, "learning_rate": 1.4104010306933558e-06, "loss": 0.043, "reward": 2.037109375, "reward_std": 0.2523764297366142, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2115 }, { "clip_ratio": 0.0, "completion_length": 745.515625, "epoch": 0.8464, "grad_norm": 0.20207977485162765, "kl": 0.137939453125, "learning_rate": 1.40325993882509e-06, "loss": 0.0849, "reward": 1.857421875, "reward_std": 0.3776264563202858, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.943359375, "step": 2116 }, { "clip_ratio": 0.0, "completion_length": 707.828125, "epoch": 0.8468, "grad_norm": 0.37237255018869536, "kl": 0.128662109375, "learning_rate": 1.3961356067414667e-06, "loss": 0.1113, "reward": 1.88671875, "reward_std": 0.39663857221603394, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94921875, "step": 2117 }, { "clip_ratio": 0.0, "completion_length": 729.4921875, "epoch": 0.8472, "grad_norm": 0.4683941248489172, "kl": 0.130126953125, "learning_rate": 1.3890280483317375e-06, "loss": 0.0423, "reward": 2.0859375, "reward_std": 0.19213032722473145, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2118 }, { "clip_ratio": 0.0, "completion_length": 692.8671875, "epoch": 0.8476, "grad_norm": 0.2936596271418644, "kl": 0.1273193359375, "learning_rate": 1.381937277452451e-06, "loss": 0.0768, "reward": 2.212890625, "reward_std": 0.4240039363503456, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.962890625, "step": 2119 }, { "clip_ratio": 0.0, "completion_length": 738.6953125, "epoch": 0.848, "grad_norm": 0.26131165623945846, "kl": 0.12939453125, "learning_rate": 1.3748633079274254e-06, "loss": 0.0564, "reward": 1.984375, "reward_std": 0.3210155963897705, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.9609375, "step": 2120 }, { "clip_ratio": 0.0, "completion_length": 723.78125, "epoch": 0.8484, "grad_norm": 0.3527206501494313, "kl": 0.1307373046875, "learning_rate": 1.3678061535477305e-06, "loss": 0.1002, "reward": 2.0, "reward_std": 0.3285759389400482, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.953125, "step": 2121 }, { "clip_ratio": 0.0, "completion_length": 681.0546875, "epoch": 0.8488, "grad_norm": 0.37780019796727027, "kl": 0.12548828125, "learning_rate": 1.3607658280716474e-06, "loss": 0.1462, "reward": 1.94140625, "reward_std": 0.43466294556856155, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.92578125, "step": 2122 }, { "clip_ratio": 0.0, "completion_length": 718.390625, "epoch": 0.8492, "grad_norm": 0.44615849084958, "kl": 0.142333984375, "learning_rate": 1.3537423452246522e-06, "loss": 0.0987, "reward": 1.939453125, "reward_std": 0.3879173472523689, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 2123 }, { "clip_ratio": 0.0, "completion_length": 606.2421875, "epoch": 0.8496, "grad_norm": 0.24474527476787658, "kl": 0.140625, "learning_rate": 1.3467357186993802e-06, "loss": 0.1788, "reward": 2.177734375, "reward_std": 0.4744638279080391, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.951171875, "step": 2124 }, { "clip_ratio": 0.0, "completion_length": 765.5625, "epoch": 0.85, "grad_norm": 0.6563832924103487, "kl": 0.1285400390625, "learning_rate": 1.339745962155613e-06, "loss": 0.0573, "reward": 2.140625, "reward_std": 0.3538324758410454, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.96875, "step": 2125 }, { "clip_ratio": 0.0, "completion_length": 725.7109375, "epoch": 0.8504, "grad_norm": 0.558577539934642, "kl": 0.1453857421875, "learning_rate": 1.3327730892202384e-06, "loss": 0.0908, "reward": 2.162109375, "reward_std": 0.24534359574317932, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 2126 }, { "clip_ratio": 0.0, "completion_length": 751.5859375, "epoch": 0.8508, "grad_norm": 0.192330138513784, "kl": 0.119873046875, "learning_rate": 1.3258171134872267e-06, "loss": 0.0394, "reward": 1.94140625, "reward_std": 0.2667705565690994, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.96484375, "step": 2127 }, { "clip_ratio": 0.0, "completion_length": 679.2578125, "epoch": 0.8512, "grad_norm": 0.31957585605528427, "kl": 0.1226806640625, "learning_rate": 1.3188780485176089e-06, "loss": 0.0361, "reward": 2.017578125, "reward_std": 0.3866526857018471, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2128 }, { "clip_ratio": 0.0, "completion_length": 739.7421875, "epoch": 0.8516, "grad_norm": 0.21052188440651756, "kl": 0.1204833984375, "learning_rate": 1.3119559078394462e-06, "loss": 0.0196, "reward": 2.033203125, "reward_std": 0.14577669650316238, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 2129 }, { "clip_ratio": 0.0, "completion_length": 751.7890625, "epoch": 0.852, "grad_norm": 0.4753083068420027, "kl": 0.1527099609375, "learning_rate": 1.30505070494781e-06, "loss": 0.0686, "reward": 1.873046875, "reward_std": 0.28962603211402893, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.943359375, "step": 2130 }, { "clip_ratio": 0.0, "completion_length": 798.875, "epoch": 0.8524, "grad_norm": 0.35134184123117956, "kl": 0.1439208984375, "learning_rate": 1.2981624533047432e-06, "loss": 0.0643, "reward": 1.98828125, "reward_std": 0.390874020755291, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.94921875, "step": 2131 }, { "clip_ratio": 0.0, "completion_length": 751.546875, "epoch": 0.8528, "grad_norm": 0.2739153062831752, "kl": 0.15966796875, "learning_rate": 1.2912911663392468e-06, "loss": 0.083, "reward": 1.861328125, "reward_std": 0.4094906374812126, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.939453125, "step": 2132 }, { "clip_ratio": 0.0, "completion_length": 780.1015625, "epoch": 0.8532, "grad_norm": 0.3409286598445095, "kl": 0.12841796875, "learning_rate": 1.2844368574472454e-06, "loss": 0.0234, "reward": 1.92578125, "reward_std": 0.1856590062379837, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 2133 }, { "clip_ratio": 0.0, "completion_length": 642.796875, "epoch": 0.8536, "grad_norm": 0.30742005932314126, "kl": 0.140869140625, "learning_rate": 1.277599539991563e-06, "loss": 0.0259, "reward": 2.009765625, "reward_std": 0.26694802194833755, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2134 }, { "clip_ratio": 0.0, "completion_length": 640.609375, "epoch": 0.854, "grad_norm": 0.17904752392986012, "kl": 0.1298828125, "learning_rate": 1.2707792273019049e-06, "loss": 0.0222, "reward": 2.1796875, "reward_std": 0.12654343992471695, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.9921875, "step": 2135 }, { "clip_ratio": 0.0, "completion_length": 756.9375, "epoch": 0.8544, "grad_norm": 0.771758678431535, "kl": 0.14111328125, "learning_rate": 1.2639759326748136e-06, "loss": 0.0694, "reward": 1.84765625, "reward_std": 0.4020465165376663, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.94140625, "step": 2136 }, { "clip_ratio": 0.0, "completion_length": 737.234375, "epoch": 0.8548, "grad_norm": 0.18210124309345999, "kl": 0.12060546875, "learning_rate": 1.257189669373664e-06, "loss": 0.0545, "reward": 1.91015625, "reward_std": 0.29601940512657166, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.96484375, "step": 2137 }, { "clip_ratio": 0.0, "completion_length": 703.703125, "epoch": 0.8552, "grad_norm": 0.2332476377010176, "kl": 0.1416015625, "learning_rate": 1.2504204506286244e-06, "loss": 0.0819, "reward": 2.08203125, "reward_std": 0.296875, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.97265625, "step": 2138 }, { "clip_ratio": 0.0, "completion_length": 716.3828125, "epoch": 0.8556, "grad_norm": 0.43581666702271704, "kl": 0.1146240234375, "learning_rate": 1.2436682896366282e-06, "loss": 0.0356, "reward": 2.25390625, "reward_std": 0.33147957921028137, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 2139 }, { "clip_ratio": 0.0, "completion_length": 709.8515625, "epoch": 0.856, "grad_norm": 0.26521268454976726, "kl": 0.1298828125, "learning_rate": 1.2369331995613664e-06, "loss": 0.0799, "reward": 1.90625, "reward_std": 0.26002171635627747, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.9609375, "step": 2140 }, { "clip_ratio": 0.0, "completion_length": 736.3203125, "epoch": 0.8564, "grad_norm": 0.14213284958551273, "kl": 0.124267578125, "learning_rate": 1.230215193533233e-06, "loss": 0.0157, "reward": 2.212890625, "reward_std": 0.11973956227302551, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 2141 }, { "clip_ratio": 0.0, "completion_length": 675.421875, "epoch": 0.8568, "grad_norm": 0.2899297027615894, "kl": 0.1253662109375, "learning_rate": 1.223514284649331e-06, "loss": 0.0214, "reward": 1.974609375, "reward_std": 0.19190485030412674, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2142 }, { "clip_ratio": 0.0, "completion_length": 776.796875, "epoch": 0.8572, "grad_norm": 0.21783238150404707, "kl": 0.120361328125, "learning_rate": 1.2168304859734226e-06, "loss": 0.0355, "reward": 1.935546875, "reward_std": 0.3022633120417595, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.966796875, "step": 2143 }, { "clip_ratio": 0.0, "completion_length": 689.1640625, "epoch": 0.8576, "grad_norm": 0.4184525282721595, "kl": 0.141845703125, "learning_rate": 1.210163810535917e-06, "loss": 0.1171, "reward": 2.078125, "reward_std": 0.547583244740963, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9453125, "step": 2144 }, { "clip_ratio": 0.0, "completion_length": 610.03125, "epoch": 0.858, "grad_norm": 0.3684587971367006, "kl": 0.14453125, "learning_rate": 1.2035142713338366e-06, "loss": 0.111, "reward": 1.994140625, "reward_std": 0.3201225735247135, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.970703125, "step": 2145 }, { "clip_ratio": 0.0, "completion_length": 699.265625, "epoch": 0.8584, "grad_norm": 0.23394790621910028, "kl": 0.1370849609375, "learning_rate": 1.196881881330798e-06, "loss": 0.0749, "reward": 2.04296875, "reward_std": 0.3413620740175247, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 2146 }, { "clip_ratio": 0.0, "completion_length": 602.6796875, "epoch": 0.8588, "grad_norm": 1.1392483412418872, "kl": 0.13037109375, "learning_rate": 1.1902666534569884e-06, "loss": 0.0311, "reward": 2.203125, "reward_std": 0.28588032722473145, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2147 }, { "clip_ratio": 0.0, "completion_length": 693.203125, "epoch": 0.8592, "grad_norm": 0.2926884144316323, "kl": 0.119873046875, "learning_rate": 1.1836686006091313e-06, "loss": 0.1017, "reward": 2.06640625, "reward_std": 0.494917631149292, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 2148 }, { "clip_ratio": 0.0, "completion_length": 708.5390625, "epoch": 0.8596, "grad_norm": 0.2732286434914359, "kl": 0.1307373046875, "learning_rate": 1.1770877356504684e-06, "loss": 0.0351, "reward": 2.107421875, "reward_std": 0.3964216634631157, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 2149 }, { "clip_ratio": 0.0, "completion_length": 723.9140625, "epoch": 0.86, "grad_norm": 0.18945670932085662, "kl": 0.1138916015625, "learning_rate": 1.1705240714107301e-06, "loss": 0.0285, "reward": 1.994140625, "reward_std": 0.16968154907226562, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 2150 }, { "clip_ratio": 0.0, "completion_length": 594.578125, "epoch": 0.8604, "grad_norm": 0.36477541218225, "kl": 0.125732421875, "learning_rate": 1.1639776206861197e-06, "loss": 0.0174, "reward": 2.162109375, "reward_std": 0.3820062205195427, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2151 }, { "clip_ratio": 0.0, "completion_length": 681.3203125, "epoch": 0.8608, "grad_norm": 0.5194575298794798, "kl": 0.134521484375, "learning_rate": 1.1574483962392768e-06, "loss": 0.1584, "reward": 2.14453125, "reward_std": 0.4315648227930069, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 2152 }, { "clip_ratio": 0.0, "completion_length": 757.453125, "epoch": 0.8612, "grad_norm": 0.31197466306268956, "kl": 0.129638671875, "learning_rate": 1.1509364107992582e-06, "loss": 0.0565, "reward": 2.001953125, "reward_std": 0.40205247700214386, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 2153 }, { "clip_ratio": 0.0, "completion_length": 720.9140625, "epoch": 0.8616, "grad_norm": 0.31781564432964776, "kl": 0.1280517578125, "learning_rate": 1.1444416770615118e-06, "loss": 0.0606, "reward": 2.080078125, "reward_std": 0.4046481251716614, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.970703125, "step": 2154 }, { "clip_ratio": 0.0, "completion_length": 647.7421875, "epoch": 0.862, "grad_norm": 0.37145738774612586, "kl": 0.1251220703125, "learning_rate": 1.1379642076878528e-06, "loss": 0.1066, "reward": 2.21875, "reward_std": 0.4275077283382416, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.953125, "step": 2155 }, { "clip_ratio": 0.0, "completion_length": 685.78125, "epoch": 0.8624, "grad_norm": 0.3421660232391935, "kl": 0.132080078125, "learning_rate": 1.1315040153064416e-06, "loss": 0.0704, "reward": 2.029296875, "reward_std": 0.3134972006082535, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 2156 }, { "clip_ratio": 0.0, "completion_length": 673.8359375, "epoch": 0.8628, "grad_norm": 0.3339475179219242, "kl": 0.134521484375, "learning_rate": 1.1250611125117527e-06, "loss": 0.0589, "reward": 2.1953125, "reward_std": 0.3901100903749466, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2157 }, { "clip_ratio": 0.0, "completion_length": 755.765625, "epoch": 0.8632, "grad_norm": 0.14426326200343606, "kl": 0.119873046875, "learning_rate": 1.1186355118645552e-06, "loss": 0.0257, "reward": 2.091796875, "reward_std": 0.1328125, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 2158 }, { "clip_ratio": 0.0, "completion_length": 731.15625, "epoch": 0.8636, "grad_norm": 0.43491910971307596, "kl": 0.126708984375, "learning_rate": 1.1122272258918864e-06, "loss": 0.0321, "reward": 2.0625, "reward_std": 0.28071650862693787, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2159 }, { "clip_ratio": 0.0, "completion_length": 674.71875, "epoch": 0.864, "grad_norm": 0.21572251996636962, "kl": 0.1241455078125, "learning_rate": 1.1058362670870248e-06, "loss": 0.0261, "reward": 2.08984375, "reward_std": 0.21755754202604294, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2160 }, { "clip_ratio": 0.0, "completion_length": 702.5234375, "epoch": 0.8644, "grad_norm": 0.44967969154255666, "kl": 0.154296875, "learning_rate": 1.0994626479094749e-06, "loss": 0.0869, "reward": 2.06640625, "reward_std": 0.3632311001420021, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 2161 }, { "clip_ratio": 0.0, "completion_length": 646.9453125, "epoch": 0.8648, "grad_norm": 0.29846227725895796, "kl": 0.145751953125, "learning_rate": 1.093106380784934e-06, "loss": 0.0715, "reward": 2.037109375, "reward_std": 0.25535898655653, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2162 }, { "clip_ratio": 0.0, "completion_length": 691.671875, "epoch": 0.8652, "grad_norm": 0.2115803889211379, "kl": 0.1334228515625, "learning_rate": 1.0867674781052683e-06, "loss": 0.0792, "reward": 1.91796875, "reward_std": 0.2072029784321785, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 2163 }, { "clip_ratio": 0.0, "completion_length": 657.140625, "epoch": 0.8656, "grad_norm": 0.1976223248570973, "kl": 0.1224365234375, "learning_rate": 1.0804459522284927e-06, "loss": 0.0363, "reward": 2.19921875, "reward_std": 0.2157595381140709, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 2164 }, { "clip_ratio": 0.0, "completion_length": 760.6875, "epoch": 0.866, "grad_norm": 0.19036157862767616, "kl": 0.1180419921875, "learning_rate": 1.0741418154787443e-06, "loss": 0.045, "reward": 1.96484375, "reward_std": 0.2569936662912369, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 2165 }, { "clip_ratio": 0.0, "completion_length": 672.1328125, "epoch": 0.8664, "grad_norm": 0.2431871819372251, "kl": 0.14013671875, "learning_rate": 1.0678550801462662e-06, "loss": 0.0436, "reward": 2.134765625, "reward_std": 0.29090986400842667, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2166 }, { "clip_ratio": 0.0, "completion_length": 667.3515625, "epoch": 0.8668, "grad_norm": 0.31948114976178327, "kl": 0.157958984375, "learning_rate": 1.0615857584873624e-06, "loss": 0.0666, "reward": 2.0703125, "reward_std": 0.18409235030412674, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2167 }, { "clip_ratio": 0.0, "completion_length": 711.6640625, "epoch": 0.8672, "grad_norm": 1.6212698480546335, "kl": 0.132568359375, "learning_rate": 1.0553338627244026e-06, "loss": 0.0469, "reward": 2.044921875, "reward_std": 0.29828938841819763, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 2168 }, { "clip_ratio": 0.0, "completion_length": 641.9375, "epoch": 0.8676, "grad_norm": 0.1946098028443918, "kl": 0.1180419921875, "learning_rate": 1.0490994050457748e-06, "loss": 0.0236, "reward": 2.033203125, "reward_std": 0.1171875, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 2169 }, { "clip_ratio": 0.0, "completion_length": 770.1484375, "epoch": 0.868, "grad_norm": 0.2255938581667295, "kl": 0.11328125, "learning_rate": 1.042882397605871e-06, "loss": 0.0506, "reward": 2.001953125, "reward_std": 0.32739967107772827, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2170 }, { "clip_ratio": 0.0, "completion_length": 641.546875, "epoch": 0.8684, "grad_norm": 0.5712297732165551, "kl": 0.1090087890625, "learning_rate": 1.0366828525250728e-06, "loss": 0.1041, "reward": 2.017578125, "reward_std": 0.3997393175959587, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.955078125, "step": 2171 }, { "clip_ratio": 0.0, "completion_length": 632.140625, "epoch": 0.8688, "grad_norm": 0.19339378993100298, "kl": 0.11474609375, "learning_rate": 1.0305007818897006e-06, "loss": 0.1158, "reward": 2.017578125, "reward_std": 0.27201082557439804, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2172 }, { "clip_ratio": 0.0, "completion_length": 701.7734375, "epoch": 0.8692, "grad_norm": 0.1965411247569072, "kl": 0.11083984375, "learning_rate": 1.024336197752025e-06, "loss": 0.106, "reward": 2.01953125, "reward_std": 0.2613852098584175, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 2173 }, { "clip_ratio": 0.0, "completion_length": 720.5234375, "epoch": 0.8696, "grad_norm": 0.581263927809315, "kl": 0.1383056640625, "learning_rate": 1.0181891121302145e-06, "loss": 0.07, "reward": 1.9921875, "reward_std": 0.24901220202445984, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 2174 }, { "clip_ratio": 0.0, "completion_length": 676.5234375, "epoch": 0.87, "grad_norm": 0.24259180839080727, "kl": 0.120361328125, "learning_rate": 1.012059537008332e-06, "loss": 0.0627, "reward": 1.9765625, "reward_std": 0.23947983980178833, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2175 }, { "clip_ratio": 0.0, "completion_length": 663.765625, "epoch": 0.8704, "grad_norm": 0.22269860463520316, "kl": 0.135498046875, "learning_rate": 1.0059474843362893e-06, "loss": 0.0635, "reward": 2.212890625, "reward_std": 0.1484375, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 2176 }, { "clip_ratio": 0.0, "completion_length": 693.9453125, "epoch": 0.8708, "grad_norm": 0.5101717407298362, "kl": 0.1688232421875, "learning_rate": 9.99852966029854e-07, "loss": 0.1218, "reward": 1.896484375, "reward_std": 0.43742595613002777, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.943359375, "step": 2177 }, { "clip_ratio": 0.0, "completion_length": 755.6015625, "epoch": 0.8712, "grad_norm": 1.1014231780173214, "kl": 0.1612548828125, "learning_rate": 9.93775993970597e-07, "loss": 0.0523, "reward": 1.90625, "reward_std": 0.2609814256429672, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.9609375, "step": 2178 }, { "clip_ratio": 0.0, "completion_length": 667.0703125, "epoch": 0.8716, "grad_norm": 0.2353189431172995, "kl": 0.148681640625, "learning_rate": 9.877165800058874e-07, "loss": 0.0622, "reward": 2.001953125, "reward_std": 0.18572373688220978, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 2179 }, { "clip_ratio": 0.0, "completion_length": 761.328125, "epoch": 0.872, "grad_norm": 11.317669841849868, "kl": 0.132080078125, "learning_rate": 9.816747359488632e-07, "loss": 0.029, "reward": 2.033203125, "reward_std": 0.33196407556533813, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2180 }, { "clip_ratio": 0.0, "completion_length": 722.859375, "epoch": 0.8724, "grad_norm": 0.19273437371239024, "kl": 0.128662109375, "learning_rate": 9.756504735784067e-07, "loss": 0.0473, "reward": 1.9375, "reward_std": 0.24318470060825348, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.96875, "step": 2181 }, { "clip_ratio": 0.0, "completion_length": 707.390625, "epoch": 0.8728, "grad_norm": 0.215844736757589, "kl": 0.128173828125, "learning_rate": 9.696438046391288e-07, "loss": 0.0814, "reward": 1.884765625, "reward_std": 0.3890254348516464, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.939453125, "step": 2182 }, { "clip_ratio": 0.0, "completion_length": 729.8046875, "epoch": 0.8732, "grad_norm": 0.18866062155551533, "kl": 0.13427734375, "learning_rate": 9.636547408413355e-07, "loss": 0.0574, "reward": 1.91796875, "reward_std": 0.2588096931576729, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 2183 }, { "clip_ratio": 0.0, "completion_length": 652.7578125, "epoch": 0.8736, "grad_norm": 0.30291115593407736, "kl": 0.139892578125, "learning_rate": 9.576832938610137e-07, "loss": 0.0548, "reward": 2.09375, "reward_std": 0.23687045276165009, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2184 }, { "clip_ratio": 0.0, "completion_length": 685.0390625, "epoch": 0.874, "grad_norm": 0.18244883259254152, "kl": 0.13818359375, "learning_rate": 9.517294753398066e-07, "loss": 0.0101, "reward": 2.314453125, "reward_std": 0.13125257194042206, "rewards/accuracy_reward": 0.3515625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 2185 }, { "clip_ratio": 0.0, "completion_length": 611.0234375, "epoch": 0.8744, "grad_norm": 0.33863690122565004, "kl": 0.142333984375, "learning_rate": 9.457932968849826e-07, "loss": 0.0289, "reward": 2.017578125, "reward_std": 0.15988312661647797, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 2186 }, { "clip_ratio": 0.0, "completion_length": 797.203125, "epoch": 0.8748, "grad_norm": 0.30610407248971455, "kl": 0.1351318359375, "learning_rate": 9.398747700694322e-07, "loss": 0.083, "reward": 1.990234375, "reward_std": 0.467099130153656, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.919921875, "step": 2187 }, { "clip_ratio": 0.0, "completion_length": 754.6796875, "epoch": 0.8752, "grad_norm": 0.23172753075913077, "kl": 0.114013671875, "learning_rate": 9.339739064316233e-07, "loss": 0.0881, "reward": 1.951171875, "reward_std": 0.40919866412878036, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.943359375, "step": 2188 }, { "clip_ratio": 0.0, "completion_length": 675.0078125, "epoch": 0.8756, "grad_norm": 0.25089974507037516, "kl": 0.133544921875, "learning_rate": 9.280907174755916e-07, "loss": 0.0849, "reward": 2.068359375, "reward_std": 0.33825747668743134, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 2189 }, { "clip_ratio": 0.0, "completion_length": 703.28125, "epoch": 0.876, "grad_norm": 0.2129894776367784, "kl": 0.13427734375, "learning_rate": 9.222252146709143e-07, "loss": 0.0565, "reward": 1.986328125, "reward_std": 0.20121560245752335, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 2190 }, { "clip_ratio": 0.0, "completion_length": 586.5390625, "epoch": 0.8764, "grad_norm": 0.25345221659578065, "kl": 0.14501953125, "learning_rate": 9.16377409452689e-07, "loss": 0.0692, "reward": 2.16796875, "reward_std": 0.24074658751487732, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.97265625, "step": 2191 }, { "clip_ratio": 0.0, "completion_length": 723.890625, "epoch": 0.8768, "grad_norm": 0.3945449689047406, "kl": 0.1300048828125, "learning_rate": 9.105473132215126e-07, "loss": 0.0712, "reward": 1.94140625, "reward_std": 0.374765008687973, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94921875, "step": 2192 }, { "clip_ratio": 0.0, "completion_length": 634.328125, "epoch": 0.8772, "grad_norm": 0.5255656482245834, "kl": 0.13916015625, "learning_rate": 9.047349373434566e-07, "loss": 0.0651, "reward": 2.169921875, "reward_std": 0.24095968157052994, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 2193 }, { "clip_ratio": 0.0, "completion_length": 655.03125, "epoch": 0.8776, "grad_norm": 0.3036642857508571, "kl": 0.13818359375, "learning_rate": 8.989402931500434e-07, "loss": 0.1073, "reward": 2.080078125, "reward_std": 0.38138431310653687, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 2194 }, { "clip_ratio": 0.0, "completion_length": 734.5859375, "epoch": 0.878, "grad_norm": 0.3386150713004865, "kl": 0.11279296875, "learning_rate": 8.931633919382299e-07, "loss": 0.0636, "reward": 1.96875, "reward_std": 0.41076524555683136, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.953125, "step": 2195 }, { "clip_ratio": 0.0, "completion_length": 693.578125, "epoch": 0.8784, "grad_norm": 0.22957548343178655, "kl": 0.1243896484375, "learning_rate": 8.874042449703779e-07, "loss": 0.009, "reward": 2.248046875, "reward_std": 0.0703125, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 2196 }, { "clip_ratio": 0.0, "completion_length": 718.15625, "epoch": 0.8788, "grad_norm": 0.27501026882414686, "kl": 0.126708984375, "learning_rate": 8.816628634742441e-07, "loss": 0.017, "reward": 2.228515625, "reward_std": 0.0859375, "rewards/accuracy_reward": 0.2421875, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 2197 }, { "clip_ratio": 0.0, "completion_length": 644.546875, "epoch": 0.8792, "grad_norm": 0.3121810309087176, "kl": 0.1239013671875, "learning_rate": 8.759392586429394e-07, "loss": 0.0269, "reward": 2.19921875, "reward_std": 0.2771979495882988, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2198 }, { "clip_ratio": 0.0, "completion_length": 746.4609375, "epoch": 0.8796, "grad_norm": 0.3327204687406081, "kl": 0.134521484375, "learning_rate": 8.702334416349279e-07, "loss": 0.107, "reward": 1.943359375, "reward_std": 0.4872926324605942, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.935546875, "step": 2199 }, { "clip_ratio": 0.0, "completion_length": 711.765625, "epoch": 0.88, "grad_norm": 0.8538490565100337, "kl": 0.118408203125, "learning_rate": 8.645454235739903e-07, "loss": 0.0545, "reward": 2.087890625, "reward_std": 0.30817168205976486, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.970703125, "step": 2200 }, { "clip_ratio": 0.0, "completion_length": 681.421875, "epoch": 0.8804, "grad_norm": 0.18573696058155334, "kl": 0.116943359375, "learning_rate": 8.58875215549212e-07, "loss": 0.0474, "reward": 2.111328125, "reward_std": 0.2818055525422096, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 2201 }, { "clip_ratio": 0.0, "completion_length": 685.7421875, "epoch": 0.8808, "grad_norm": 0.16931540158930544, "kl": 0.13671875, "learning_rate": 8.532228286149502e-07, "loss": 0.0362, "reward": 2.12109375, "reward_std": 0.1335124969482422, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2202 }, { "clip_ratio": 0.0, "completion_length": 711.8671875, "epoch": 0.8812, "grad_norm": 0.23149549342764322, "kl": 0.130615234375, "learning_rate": 8.475882737908248e-07, "loss": 0.0637, "reward": 2.171875, "reward_std": 0.28625842928886414, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 2203 }, { "clip_ratio": 0.0, "completion_length": 714.046875, "epoch": 0.8816, "grad_norm": 0.3036047495506452, "kl": 0.1197509765625, "learning_rate": 8.419715620616875e-07, "loss": 0.0422, "reward": 2.02734375, "reward_std": 0.203125, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2204 }, { "clip_ratio": 0.0, "completion_length": 621.3671875, "epoch": 0.882, "grad_norm": 0.37877406856776424, "kl": 0.1451416015625, "learning_rate": 8.363727043776037e-07, "loss": 0.062, "reward": 2.029296875, "reward_std": 0.279289186000824, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2205 }, { "clip_ratio": 0.0, "completion_length": 565.2109375, "epoch": 0.8824, "grad_norm": 0.6480600550214601, "kl": 0.155517578125, "learning_rate": 8.307917116538378e-07, "loss": 0.0532, "reward": 2.189453125, "reward_std": 0.2465549185872078, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 2206 }, { "clip_ratio": 0.0, "completion_length": 679.625, "epoch": 0.8828, "grad_norm": 0.6403884886499075, "kl": 0.15185546875, "learning_rate": 8.252285947708139e-07, "loss": 0.0687, "reward": 1.96484375, "reward_std": 0.36745496094226837, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 2207 }, { "clip_ratio": 0.0, "completion_length": 707.2265625, "epoch": 0.8832, "grad_norm": 0.23407292560371806, "kl": 0.136474609375, "learning_rate": 8.196833645741187e-07, "loss": 0.0816, "reward": 2.0703125, "reward_std": 0.18409235030412674, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2208 }, { "clip_ratio": 0.0, "completion_length": 638.6484375, "epoch": 0.8836, "grad_norm": 0.2821409011869321, "kl": 0.131591796875, "learning_rate": 8.141560318744601e-07, "loss": 0.0848, "reward": 2.03125, "reward_std": 0.3225876986980438, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2209 }, { "clip_ratio": 0.0, "completion_length": 772.9140625, "epoch": 0.884, "grad_norm": 0.3049512809727972, "kl": 0.145263671875, "learning_rate": 8.086466074476562e-07, "loss": 0.0335, "reward": 2.111328125, "reward_std": 0.3923328295350075, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.962890625, "step": 2210 }, { "clip_ratio": 0.0, "completion_length": 648.6875, "epoch": 0.8844, "grad_norm": 0.21947509252986133, "kl": 0.13134765625, "learning_rate": 8.031551020346129e-07, "loss": 0.0428, "reward": 2.39453125, "reward_std": 0.24575403332710266, "rewards/accuracy_reward": 0.4453125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 2211 }, { "clip_ratio": 0.0, "completion_length": 755.3984375, "epoch": 0.8848, "grad_norm": 0.22969333801852287, "kl": 0.1142578125, "learning_rate": 7.976815263412963e-07, "loss": 0.0644, "reward": 1.96875, "reward_std": 0.29472560435533524, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.96875, "step": 2212 }, { "clip_ratio": 0.0, "completion_length": 703.953125, "epoch": 0.8852, "grad_norm": 0.23765195306234904, "kl": 0.1162109375, "learning_rate": 7.922258910387282e-07, "loss": 0.0628, "reward": 2.091796875, "reward_std": 0.3486010432243347, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.974609375, "step": 2213 }, { "clip_ratio": 0.0, "completion_length": 589.84375, "epoch": 0.8856, "grad_norm": 0.5143806478601269, "kl": 0.135498046875, "learning_rate": 7.867882067629473e-07, "loss": 0.1215, "reward": 2.068359375, "reward_std": 0.3738879859447479, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 2214 }, { "clip_ratio": 0.0, "completion_length": 732.1328125, "epoch": 0.886, "grad_norm": 0.8638978078579922, "kl": 0.130859375, "learning_rate": 7.81368484114996e-07, "loss": 0.1008, "reward": 2.041015625, "reward_std": 0.30925223231315613, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.955078125, "step": 2215 }, { "clip_ratio": 0.0, "completion_length": 661.578125, "epoch": 0.8864, "grad_norm": 0.21910142331608995, "kl": 0.127197265625, "learning_rate": 7.759667336609011e-07, "loss": 0.0433, "reward": 1.96484375, "reward_std": 0.21888422966003418, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.97265625, "step": 2216 }, { "clip_ratio": 0.0, "completion_length": 699.53125, "epoch": 0.8868, "grad_norm": 0.26988923567866113, "kl": 0.1182861328125, "learning_rate": 7.7058296593165e-07, "loss": 0.1513, "reward": 2.005859375, "reward_std": 0.3373253494501114, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.951171875, "step": 2217 }, { "clip_ratio": 0.0, "completion_length": 644.375, "epoch": 0.8872, "grad_norm": 0.27881179642597037, "kl": 0.12158203125, "learning_rate": 7.652171914231777e-07, "loss": 0.0653, "reward": 1.970703125, "reward_std": 0.29127389937639236, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2218 }, { "clip_ratio": 0.0, "completion_length": 630.671875, "epoch": 0.8876, "grad_norm": 0.2895522633055213, "kl": 0.135986328125, "learning_rate": 7.598694205963331e-07, "loss": 0.0477, "reward": 2.265625, "reward_std": 0.2368273288011551, "rewards/accuracy_reward": 0.2890625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 2219 }, { "clip_ratio": 0.0, "completion_length": 607.9921875, "epoch": 0.888, "grad_norm": 0.2147513552167304, "kl": 0.1348876953125, "learning_rate": 7.545396638768698e-07, "loss": 0.0751, "reward": 1.9609375, "reward_std": 0.2267879769206047, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2220 }, { "clip_ratio": 0.0, "completion_length": 652.203125, "epoch": 0.8884, "grad_norm": 0.279405705690866, "kl": 0.12451171875, "learning_rate": 7.492279316554207e-07, "loss": 0.0637, "reward": 1.9921875, "reward_std": 0.3446475714445114, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.96875, "step": 2221 }, { "clip_ratio": 0.0, "completion_length": 746.0078125, "epoch": 0.8888, "grad_norm": 0.41481245141741574, "kl": 0.1243896484375, "learning_rate": 7.439342342874789e-07, "loss": 0.0997, "reward": 1.84765625, "reward_std": 0.38632985204458237, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.93359375, "step": 2222 }, { "clip_ratio": 0.0, "completion_length": 648.46875, "epoch": 0.8892, "grad_norm": 0.2982505260732196, "kl": 0.142333984375, "learning_rate": 7.386585820933812e-07, "loss": 0.0553, "reward": 2.18359375, "reward_std": 0.23358771204948425, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2223 }, { "clip_ratio": 0.0, "completion_length": 743.90625, "epoch": 0.8896, "grad_norm": 0.21265423531916844, "kl": 0.116943359375, "learning_rate": 7.334009853582791e-07, "loss": 0.027, "reward": 2.115234375, "reward_std": 0.30854610353708267, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.982421875, "step": 2224 }, { "clip_ratio": 0.0, "completion_length": 703.71875, "epoch": 0.89, "grad_norm": 0.3119083046875019, "kl": 0.125732421875, "learning_rate": 7.281614543321269e-07, "loss": 0.0503, "reward": 2.021484375, "reward_std": 0.4855576232075691, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.951171875, "step": 2225 }, { "clip_ratio": 0.0, "completion_length": 740.1640625, "epoch": 0.8904, "grad_norm": 0.4203273818200838, "kl": 0.121337890625, "learning_rate": 7.22939999229657e-07, "loss": 0.0364, "reward": 2.03515625, "reward_std": 0.29153141379356384, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 2226 }, { "clip_ratio": 0.0, "completion_length": 730.53125, "epoch": 0.8908, "grad_norm": 0.14952741776724404, "kl": 0.1278076171875, "learning_rate": 7.177366302303667e-07, "loss": 0.0477, "reward": 2.0390625, "reward_std": 0.2744347006082535, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.9609375, "step": 2227 }, { "clip_ratio": 0.0, "completion_length": 700.734375, "epoch": 0.8912, "grad_norm": 0.24717957661420173, "kl": 0.1195068359375, "learning_rate": 7.125513574784904e-07, "loss": 0.0276, "reward": 2.126953125, "reward_std": 0.22152753919363022, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.986328125, "step": 2228 }, { "clip_ratio": 0.0, "completion_length": 638.78125, "epoch": 0.8916, "grad_norm": 0.5387964074384181, "kl": 0.158447265625, "learning_rate": 7.073841910829771e-07, "loss": 0.0163, "reward": 2.080078125, "reward_std": 0.2738635763525963, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 2229 }, { "clip_ratio": 0.0, "completion_length": 639.203125, "epoch": 0.892, "grad_norm": 0.23467127485726155, "kl": 0.129638671875, "learning_rate": 7.022351411174866e-07, "loss": 0.0797, "reward": 1.92578125, "reward_std": 0.2994709014892578, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 2230 }, { "clip_ratio": 0.0, "completion_length": 658.5078125, "epoch": 0.8924, "grad_norm": 0.9372380563498743, "kl": 0.135009765625, "learning_rate": 6.971042176203535e-07, "loss": 0.066, "reward": 2.046875, "reward_std": 0.3295811638236046, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 2231 }, { "clip_ratio": 0.0, "completion_length": 718.6796875, "epoch": 0.8928, "grad_norm": 0.6099448349643617, "kl": 0.152099609375, "learning_rate": 6.919914305945774e-07, "loss": 0.0554, "reward": 1.939453125, "reward_std": 0.320857509970665, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 2232 }, { "clip_ratio": 0.0, "completion_length": 575.9140625, "epoch": 0.8932, "grad_norm": 0.30353035263435785, "kl": 0.1441650390625, "learning_rate": 6.868967900077972e-07, "loss": 0.0942, "reward": 2.1328125, "reward_std": 0.3346090763807297, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2233 }, { "clip_ratio": 0.0, "completion_length": 723.15625, "epoch": 0.8936, "grad_norm": 0.17655279124963963, "kl": 0.1162109375, "learning_rate": 6.818203057922756e-07, "loss": 0.0391, "reward": 1.99609375, "reward_std": 0.23226578533649445, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 2234 }, { "clip_ratio": 0.0, "completion_length": 731.8671875, "epoch": 0.894, "grad_norm": 0.1954662002508974, "kl": 0.1195068359375, "learning_rate": 6.767619878448783e-07, "loss": 0.061, "reward": 1.90625, "reward_std": 0.324209988117218, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.9453125, "step": 2235 }, { "clip_ratio": 0.0, "completion_length": 746.109375, "epoch": 0.8944, "grad_norm": 0.17209105983765868, "kl": 0.1134033203125, "learning_rate": 6.717218460270536e-07, "loss": 0.0408, "reward": 1.982421875, "reward_std": 0.17979396134614944, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2236 }, { "clip_ratio": 0.0, "completion_length": 700.453125, "epoch": 0.8948, "grad_norm": 0.32982437606893683, "kl": 0.138427734375, "learning_rate": 6.666998901648203e-07, "loss": 0.0384, "reward": 1.98046875, "reward_std": 0.140625, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2237 }, { "clip_ratio": 0.0, "completion_length": 722.609375, "epoch": 0.8952, "grad_norm": 0.44335600184896423, "kl": 0.134765625, "learning_rate": 6.616961300487323e-07, "loss": 0.0277, "reward": 2.076171875, "reward_std": 0.2845374867320061, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2238 }, { "clip_ratio": 0.0, "completion_length": 724.5234375, "epoch": 0.8956, "grad_norm": 0.24215569563553546, "kl": 0.1201171875, "learning_rate": 6.567105754338798e-07, "loss": 0.0644, "reward": 1.92578125, "reward_std": 0.32471734285354614, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 2239 }, { "clip_ratio": 0.0, "completion_length": 680.890625, "epoch": 0.896, "grad_norm": 0.368493582301777, "kl": 0.1314697265625, "learning_rate": 6.517432360398556e-07, "loss": 0.1405, "reward": 1.970703125, "reward_std": 0.4784712418913841, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.939453125, "step": 2240 }, { "clip_ratio": 0.0, "completion_length": 633.5703125, "epoch": 0.8964, "grad_norm": 0.2926239786729705, "kl": 0.12646484375, "learning_rate": 6.467941215507434e-07, "loss": 0.0971, "reward": 2.169921875, "reward_std": 0.28857752680778503, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2241 }, { "clip_ratio": 0.0, "completion_length": 705.171875, "epoch": 0.8968, "grad_norm": 0.28127654032995986, "kl": 0.160888671875, "learning_rate": 6.418632416150927e-07, "loss": 0.052, "reward": 1.958984375, "reward_std": 0.1640625, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2242 }, { "clip_ratio": 0.0, "completion_length": 709.359375, "epoch": 0.8972, "grad_norm": 0.2678585500081945, "kl": 0.1207275390625, "learning_rate": 6.369506058459063e-07, "loss": 0.0956, "reward": 1.9765625, "reward_std": 0.48934731632471085, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.9296875, "step": 2243 }, { "clip_ratio": 0.0, "completion_length": 609.1796875, "epoch": 0.8976, "grad_norm": 0.3253710144037178, "kl": 0.1510009765625, "learning_rate": 6.320562238206218e-07, "loss": 0.0821, "reward": 1.974609375, "reward_std": 0.37434501200914383, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.958984375, "step": 2244 }, { "clip_ratio": 0.0, "completion_length": 612.0703125, "epoch": 0.898, "grad_norm": 0.32563597734365796, "kl": 0.12255859375, "learning_rate": 6.271801050810856e-07, "loss": 0.054, "reward": 2.05078125, "reward_std": 0.41550329327583313, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 2245 }, { "clip_ratio": 0.0, "completion_length": 720.4375, "epoch": 0.8984, "grad_norm": 0.4314764887296824, "kl": 0.130615234375, "learning_rate": 6.223222591335409e-07, "loss": 0.0965, "reward": 1.98046875, "reward_std": 0.5050966739654541, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.93359375, "step": 2246 }, { "clip_ratio": 0.0, "completion_length": 612.1328125, "epoch": 0.8988, "grad_norm": 0.3339005745313703, "kl": 0.1417236328125, "learning_rate": 6.174826954486069e-07, "loss": 0.1401, "reward": 1.912109375, "reward_std": 0.31690485030412674, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 2247 }, { "clip_ratio": 0.0, "completion_length": 719.6875, "epoch": 0.8992, "grad_norm": 0.2538464710625526, "kl": 0.126708984375, "learning_rate": 6.126614234612593e-07, "loss": 0.0671, "reward": 1.9375, "reward_std": 0.37451915442943573, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.953125, "step": 2248 }, { "clip_ratio": 0.0, "completion_length": 746.0, "epoch": 0.8996, "grad_norm": 0.22663240050791977, "kl": 0.1240234375, "learning_rate": 6.078584525708175e-07, "loss": 0.0342, "reward": 2.12890625, "reward_std": 0.16527669876813889, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2249 }, { "clip_ratio": 0.0, "completion_length": 712.265625, "epoch": 0.9, "grad_norm": 0.1612311216106911, "kl": 0.1153564453125, "learning_rate": 6.030737921409169e-07, "loss": 0.0219, "reward": 2.03125, "reward_std": 0.27503084391355515, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.953125, "step": 2250 }, { "clip_ratio": 0.0, "completion_length": 657.328125, "epoch": 0.9004, "grad_norm": 0.7202444124412973, "kl": 0.140869140625, "learning_rate": 5.98307451499498e-07, "loss": 0.1293, "reward": 2.146484375, "reward_std": 0.3720213323831558, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.951171875, "step": 2251 }, { "clip_ratio": 0.0, "completion_length": 782.34375, "epoch": 0.9008, "grad_norm": 0.24993971971133996, "kl": 0.125244140625, "learning_rate": 5.935594399387856e-07, "loss": 0.057, "reward": 2.1484375, "reward_std": 0.3703627809882164, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.9609375, "step": 2252 }, { "clip_ratio": 0.0, "completion_length": 635.8984375, "epoch": 0.9012, "grad_norm": 0.3059799490087079, "kl": 0.1341552734375, "learning_rate": 5.888297667152731e-07, "loss": 0.1271, "reward": 1.8984375, "reward_std": 0.3023856207728386, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.953125, "step": 2253 }, { "clip_ratio": 0.0, "completion_length": 632.8046875, "epoch": 0.9016, "grad_norm": 0.3227732419284049, "kl": 0.148193359375, "learning_rate": 5.841184410496992e-07, "loss": 0.1368, "reward": 2.017578125, "reward_std": 0.4261329397559166, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.962890625, "step": 2254 }, { "clip_ratio": 0.0, "completion_length": 763.09375, "epoch": 0.902, "grad_norm": 0.2599405806966546, "kl": 0.1309814453125, "learning_rate": 5.794254721270331e-07, "loss": 0.0568, "reward": 1.923828125, "reward_std": 0.3184715062379837, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 2255 }, { "clip_ratio": 0.0, "completion_length": 750.9375, "epoch": 0.9024, "grad_norm": 0.28155286363716403, "kl": 0.1346435546875, "learning_rate": 5.747508690964599e-07, "loss": 0.0582, "reward": 1.921875, "reward_std": 0.36358095705509186, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.953125, "step": 2256 }, { "clip_ratio": 0.0, "completion_length": 709.5390625, "epoch": 0.9028, "grad_norm": 0.32078554694269357, "kl": 0.114501953125, "learning_rate": 5.700946410713548e-07, "loss": 0.0315, "reward": 2.0078125, "reward_std": 0.3039911389350891, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2257 }, { "clip_ratio": 0.0, "completion_length": 698.5859375, "epoch": 0.9032, "grad_norm": 0.7616583903495276, "kl": 0.11962890625, "learning_rate": 5.654567971292757e-07, "loss": 0.0817, "reward": 1.921875, "reward_std": 0.37528686225414276, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.953125, "step": 2258 }, { "clip_ratio": 0.0, "completion_length": 692.53125, "epoch": 0.9036, "grad_norm": 0.21690298158911242, "kl": 0.123291015625, "learning_rate": 5.608373463119354e-07, "loss": 0.0599, "reward": 2.087890625, "reward_std": 0.3173172026872635, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2259 }, { "clip_ratio": 0.0, "completion_length": 702.65625, "epoch": 0.904, "grad_norm": 0.5984739273958922, "kl": 0.1390380859375, "learning_rate": 5.562362976251901e-07, "loss": 0.0863, "reward": 2.2421875, "reward_std": 0.43318019807338715, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.953125, "step": 2260 }, { "clip_ratio": 0.0, "completion_length": 623.7578125, "epoch": 0.9044, "grad_norm": 0.4880750648182626, "kl": 0.133544921875, "learning_rate": 5.516536600390188e-07, "loss": 0.0525, "reward": 2.080078125, "reward_std": 0.37006811052560806, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 2261 }, { "clip_ratio": 0.0, "completion_length": 730.3203125, "epoch": 0.9048, "grad_norm": 2.139779929008058, "kl": 0.1728515625, "learning_rate": 5.470894424875062e-07, "loss": 0.0523, "reward": 1.822265625, "reward_std": 0.1717730015516281, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.923828125, "step": 2262 }, { "clip_ratio": 0.0, "completion_length": 598.7109375, "epoch": 0.9052, "grad_norm": 0.27755851607325865, "kl": 0.124755859375, "learning_rate": 5.425436538688322e-07, "loss": 0.0318, "reward": 2.1640625, "reward_std": 0.20914089679718018, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 2263 }, { "clip_ratio": 0.0, "completion_length": 683.9765625, "epoch": 0.9056, "grad_norm": 0.6647766664428163, "kl": 0.14501953125, "learning_rate": 5.380163030452412e-07, "loss": 0.0679, "reward": 2.060546875, "reward_std": 0.3014140874147415, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 2264 }, { "clip_ratio": 0.0, "completion_length": 695.078125, "epoch": 0.906, "grad_norm": 0.31080978869748377, "kl": 0.124267578125, "learning_rate": 5.335073988430373e-07, "loss": 0.0394, "reward": 2.142578125, "reward_std": 0.1796875, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.986328125, "step": 2265 }, { "clip_ratio": 0.0, "completion_length": 690.90625, "epoch": 0.9064, "grad_norm": 0.20823262275739624, "kl": 0.1146240234375, "learning_rate": 5.290169500525577e-07, "loss": 0.0206, "reward": 2.15234375, "reward_std": 0.20517472177743912, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2266 }, { "clip_ratio": 0.0, "completion_length": 691.1796875, "epoch": 0.9068, "grad_norm": 0.2777676953808447, "kl": 0.128662109375, "learning_rate": 5.245449654281632e-07, "loss": 0.0229, "reward": 2.162109375, "reward_std": 0.3101024180650711, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.982421875, "step": 2267 }, { "clip_ratio": 0.0, "completion_length": 690.125, "epoch": 0.9072, "grad_norm": 0.22068811813751252, "kl": 0.126708984375, "learning_rate": 5.200914536882184e-07, "loss": 0.0201, "reward": 2.134765625, "reward_std": 0.1038404181599617, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 2268 }, { "clip_ratio": 0.0, "completion_length": 712.390625, "epoch": 0.9076, "grad_norm": 0.46608618035678634, "kl": 0.12109375, "learning_rate": 5.156564235150686e-07, "loss": 0.0777, "reward": 1.93359375, "reward_std": 0.29892291128635406, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.96484375, "step": 2269 }, { "clip_ratio": 0.0, "completion_length": 582.8046875, "epoch": 0.908, "grad_norm": 0.35123094988998327, "kl": 0.1279296875, "learning_rate": 5.112398835550348e-07, "loss": 0.0281, "reward": 2.080078125, "reward_std": 0.36654822528362274, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 2270 }, { "clip_ratio": 0.0, "completion_length": 694.890625, "epoch": 0.9084, "grad_norm": 0.6146636982172291, "kl": 0.1292724609375, "learning_rate": 5.068418424183874e-07, "loss": 0.0957, "reward": 2.0703125, "reward_std": 0.4144870862364769, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.953125, "step": 2271 }, { "clip_ratio": 0.0, "completion_length": 798.453125, "epoch": 0.9088, "grad_norm": 0.7909537769372625, "kl": 0.1226806640625, "learning_rate": 5.024623086793323e-07, "loss": 0.0227, "reward": 2.013671875, "reward_std": 0.22810593992471695, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2272 }, { "clip_ratio": 0.0, "completion_length": 817.4765625, "epoch": 0.9092, "grad_norm": 0.20966475034293974, "kl": 0.116455078125, "learning_rate": 4.981012908759941e-07, "loss": 0.0426, "reward": 2.021484375, "reward_std": 0.2878025621175766, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.974609375, "step": 2273 }, { "clip_ratio": 0.0, "completion_length": 693.2109375, "epoch": 0.9096, "grad_norm": 16.897551969768468, "kl": 0.232421875, "learning_rate": 4.937587975103997e-07, "loss": 0.0125, "reward": 2.087890625, "reward_std": 0.22905874252319336, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 2274 }, { "clip_ratio": 0.0, "completion_length": 695.5390625, "epoch": 0.91, "grad_norm": 0.25324624273237706, "kl": 0.115234375, "learning_rate": 4.894348370484648e-07, "loss": 0.0724, "reward": 2.197265625, "reward_std": 0.2447395622730255, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 2275 }, { "clip_ratio": 0.0, "completion_length": 722.7578125, "epoch": 0.9104, "grad_norm": 0.23609428879551245, "kl": 0.12939453125, "learning_rate": 4.851294179199673e-07, "loss": 0.0062, "reward": 2.01953125, "reward_std": 0.1285141110420227, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.99609375, "step": 2276 }, { "clip_ratio": 0.0, "completion_length": 673.203125, "epoch": 0.9108, "grad_norm": 0.24981670387394664, "kl": 0.1265869140625, "learning_rate": 4.808425485185486e-07, "loss": 0.072, "reward": 2.017578125, "reward_std": 0.20752985030412674, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.986328125, "step": 2277 }, { "clip_ratio": 0.0, "completion_length": 765.4296875, "epoch": 0.9112, "grad_norm": 0.2500061321048382, "kl": 0.115966796875, "learning_rate": 4.765742372016735e-07, "loss": 0.0739, "reward": 1.94140625, "reward_std": 0.3953908234834671, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94921875, "step": 2278 }, { "clip_ratio": 0.0, "completion_length": 795.4453125, "epoch": 0.9116, "grad_norm": 0.2847158383909135, "kl": 0.1334228515625, "learning_rate": 4.723244922906356e-07, "loss": 0.0273, "reward": 2.09375, "reward_std": 0.20544016361236572, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 2279 }, { "clip_ratio": 0.0, "completion_length": 686.7734375, "epoch": 0.912, "grad_norm": 0.2749953056437596, "kl": 0.1292724609375, "learning_rate": 4.6809332207053083e-07, "loss": 0.024, "reward": 2.005859375, "reward_std": 0.24780654162168503, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2280 }, { "clip_ratio": 0.0, "completion_length": 691.1328125, "epoch": 0.9124, "grad_norm": 0.5519914190269913, "kl": 0.13330078125, "learning_rate": 4.638807347902408e-07, "loss": 0.1088, "reward": 2.03515625, "reward_std": 0.36503424495458603, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 2281 }, { "clip_ratio": 0.0, "completion_length": 711.484375, "epoch": 0.9128, "grad_norm": 0.28829439898586784, "kl": 0.1318359375, "learning_rate": 4.596867386624215e-07, "loss": 0.0755, "reward": 1.900390625, "reward_std": 0.3113015666604042, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.955078125, "step": 2282 }, { "clip_ratio": 0.0, "completion_length": 655.9765625, "epoch": 0.9132, "grad_norm": 0.6525717063717054, "kl": 0.12548828125, "learning_rate": 4.5551134186348045e-07, "loss": 0.1284, "reward": 2.09765625, "reward_std": 0.38271933048963547, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 2283 }, { "clip_ratio": 0.0, "completion_length": 743.2265625, "epoch": 0.9136, "grad_norm": 0.530119296360817, "kl": 0.126953125, "learning_rate": 4.5135455253357053e-07, "loss": 0.0824, "reward": 2.150390625, "reward_std": 0.4709164574742317, "rewards/accuracy_reward": 0.2421875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 2284 }, { "clip_ratio": 0.0, "completion_length": 684.3125, "epoch": 0.914, "grad_norm": 0.2572675981591814, "kl": 0.1219482421875, "learning_rate": 4.4721637877656377e-07, "loss": 0.0212, "reward": 2.14453125, "reward_std": 0.19311904907226562, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2285 }, { "clip_ratio": 0.0, "completion_length": 745.1328125, "epoch": 0.9144, "grad_norm": 1.3040727279972806, "kl": 0.106689453125, "learning_rate": 4.4309682866004124e-07, "loss": 0.0746, "reward": 2.12890625, "reward_std": 0.46914589405059814, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 2286 }, { "clip_ratio": 0.0, "completion_length": 721.96875, "epoch": 0.9148, "grad_norm": 0.24061309428210884, "kl": 0.121826171875, "learning_rate": 4.3899591021527743e-07, "loss": 0.0731, "reward": 2.30859375, "reward_std": 0.4271741062402725, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.95703125, "step": 2287 }, { "clip_ratio": 0.0, "completion_length": 720.8046875, "epoch": 0.9152, "grad_norm": 0.21903688143256733, "kl": 0.1319580078125, "learning_rate": 4.349136314372204e-07, "loss": 0.048, "reward": 2.01953125, "reward_std": 0.25661925226449966, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 2288 }, { "clip_ratio": 0.0, "completion_length": 734.703125, "epoch": 0.9156, "grad_norm": 0.19767003927328455, "kl": 0.14111328125, "learning_rate": 4.308500002844862e-07, "loss": 0.046, "reward": 2.1015625, "reward_std": 0.25296996533870697, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2289 }, { "clip_ratio": 0.0, "completion_length": 700.390625, "epoch": 0.916, "grad_norm": 0.17371260823684864, "kl": 0.126953125, "learning_rate": 4.268050246793276e-07, "loss": 0.0381, "reward": 1.94921875, "reward_std": 0.21762458235025406, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.97265625, "step": 2290 }, { "clip_ratio": 0.0, "completion_length": 701.8046875, "epoch": 0.9164, "grad_norm": 0.3486976987116134, "kl": 0.1273193359375, "learning_rate": 4.2277871250763327e-07, "loss": 0.0295, "reward": 1.98046875, "reward_std": 0.140625, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2291 }, { "clip_ratio": 0.0, "completion_length": 710.140625, "epoch": 0.9168, "grad_norm": 0.265794421555441, "kl": 0.118408203125, "learning_rate": 4.1877107161890416e-07, "loss": 0.0191, "reward": 2.26171875, "reward_std": 0.37602952122688293, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.97265625, "step": 2292 }, { "clip_ratio": 0.0, "completion_length": 695.71875, "epoch": 0.9172, "grad_norm": 0.4272380373777636, "kl": 0.134765625, "learning_rate": 4.1478210982624055e-07, "loss": 0.0955, "reward": 2.091796875, "reward_std": 0.3508106395602226, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 2293 }, { "clip_ratio": 0.0, "completion_length": 714.4609375, "epoch": 0.9176, "grad_norm": 0.1565822702486342, "kl": 0.121826171875, "learning_rate": 4.108118349063306e-07, "loss": 0.0145, "reward": 2.09375, "reward_std": 0.16997354477643967, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 2294 }, { "clip_ratio": 0.0, "completion_length": 700.625, "epoch": 0.918, "grad_norm": 0.5323305241509849, "kl": 0.13134765625, "learning_rate": 4.068602545994249e-07, "loss": 0.1576, "reward": 1.888671875, "reward_std": 0.49430837482213974, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.935546875, "step": 2295 }, { "clip_ratio": 0.0, "completion_length": 694.2734375, "epoch": 0.9184, "grad_norm": 0.12754773660539312, "kl": 0.12841796875, "learning_rate": 4.0292737660933335e-07, "loss": 0.0179, "reward": 1.984375, "reward_std": 0.09630206227302551, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 2296 }, { "clip_ratio": 0.0, "completion_length": 642.03125, "epoch": 0.9188, "grad_norm": 0.2932397277308278, "kl": 0.13330078125, "learning_rate": 3.990132086034026e-07, "loss": 0.0061, "reward": 2.01953125, "reward_std": 0.19664455205202103, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2297 }, { "clip_ratio": 0.0, "completion_length": 656.765625, "epoch": 0.9192, "grad_norm": 0.2894933001409766, "kl": 0.134521484375, "learning_rate": 3.9511775821250206e-07, "loss": 0.0503, "reward": 1.947265625, "reward_std": 0.2132154181599617, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 2298 }, { "clip_ratio": 0.0, "completion_length": 697.890625, "epoch": 0.9196, "grad_norm": 0.3005002024731094, "kl": 0.1239013671875, "learning_rate": 3.912410330310157e-07, "loss": 0.0568, "reward": 2.18359375, "reward_std": 0.31162478029727936, "rewards/accuracy_reward": 0.2734375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.96484375, "step": 2299 }, { "clip_ratio": 0.0, "completion_length": 656.265625, "epoch": 0.92, "grad_norm": 0.24347209656852487, "kl": 0.1268310546875, "learning_rate": 3.8738304061681107e-07, "loss": 0.1136, "reward": 1.939453125, "reward_std": 0.27002984285354614, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2300 }, { "clip_ratio": 0.0, "completion_length": 664.7734375, "epoch": 0.9204, "grad_norm": 0.6144660672080867, "kl": 0.158203125, "learning_rate": 3.835437884912474e-07, "loss": 0.1335, "reward": 2.1875, "reward_std": 0.49622008204460144, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.921875, "step": 2301 }, { "clip_ratio": 0.0, "completion_length": 687.421875, "epoch": 0.9208, "grad_norm": 0.5928181312147887, "kl": 0.1441650390625, "learning_rate": 3.7972328413914074e-07, "loss": 0.0712, "reward": 1.943359375, "reward_std": 0.33893805742263794, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 2302 }, { "clip_ratio": 0.0, "completion_length": 808.7734375, "epoch": 0.9212, "grad_norm": 0.2865229725430978, "kl": 0.141357421875, "learning_rate": 3.759215350087619e-07, "loss": 0.0274, "reward": 2.080078125, "reward_std": 0.45746994763612747, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2303 }, { "clip_ratio": 0.0, "completion_length": 674.25, "epoch": 0.9216, "grad_norm": 1.2005832163546524, "kl": 0.127685546875, "learning_rate": 3.721385485118123e-07, "loss": 0.0114, "reward": 2.197265625, "reward_std": 0.2352311834692955, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 2304 }, { "clip_ratio": 0.0, "completion_length": 720.0703125, "epoch": 0.922, "grad_norm": 2.457875742260647, "kl": 0.14208984375, "learning_rate": 3.68374332023419e-07, "loss": 0.121, "reward": 1.974609375, "reward_std": 0.5152471959590912, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.951171875, "step": 2305 }, { "clip_ratio": 0.0, "completion_length": 626.6796875, "epoch": 0.9224, "grad_norm": 0.4149368248247992, "kl": 0.140625, "learning_rate": 3.646288928821151e-07, "loss": 0.1109, "reward": 1.931640625, "reward_std": 0.32671576738357544, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.947265625, "step": 2306 }, { "clip_ratio": 0.0, "completion_length": 674.1640625, "epoch": 0.9228, "grad_norm": 0.20321187966767257, "kl": 0.1240234375, "learning_rate": 3.609022383898242e-07, "loss": 0.0594, "reward": 2.173828125, "reward_std": 0.2532695084810257, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.962890625, "step": 2307 }, { "clip_ratio": 0.0, "completion_length": 728.03125, "epoch": 0.9232, "grad_norm": 0.8742208305625082, "kl": 0.1358642578125, "learning_rate": 3.571943758118546e-07, "loss": 0.1296, "reward": 1.830078125, "reward_std": 0.5331171154975891, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.8046875, "rewards/tag_count_reward": 0.853515625, "step": 2308 }, { "clip_ratio": 0.0, "completion_length": 735.2578125, "epoch": 0.9236, "grad_norm": 0.6470355514112118, "kl": 0.135009765625, "learning_rate": 3.5350531237686723e-07, "loss": 0.0654, "reward": 1.984375, "reward_std": 0.3826117068529129, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.953125, "step": 2309 }, { "clip_ratio": 0.0, "completion_length": 647.4375, "epoch": 0.924, "grad_norm": 0.4149382291337426, "kl": 0.1312255859375, "learning_rate": 3.498350552768859e-07, "loss": 0.06, "reward": 1.951171875, "reward_std": 0.26696794480085373, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 2310 }, { "clip_ratio": 0.0, "completion_length": 671.2890625, "epoch": 0.9244, "grad_norm": 0.6958573940421079, "kl": 0.127685546875, "learning_rate": 3.4618361166726123e-07, "loss": 0.0637, "reward": 2.095703125, "reward_std": 0.3259315490722656, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2311 }, { "clip_ratio": 0.0, "completion_length": 700.4296875, "epoch": 0.9248, "grad_norm": 0.2916102831421895, "kl": 0.144287109375, "learning_rate": 3.4255098866667114e-07, "loss": 0.068, "reward": 2.033203125, "reward_std": 0.30383190512657166, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 2312 }, { "clip_ratio": 0.0, "completion_length": 777.90625, "epoch": 0.9252, "grad_norm": 0.513302442591613, "kl": 0.1370849609375, "learning_rate": 3.3893719335709953e-07, "loss": 0.05, "reward": 2.0390625, "reward_std": 0.3765643760561943, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.9609375, "step": 2313 }, { "clip_ratio": 0.0, "completion_length": 748.5703125, "epoch": 0.9256, "grad_norm": 0.21347705636801595, "kl": 0.133544921875, "learning_rate": 3.3534223278382405e-07, "loss": 0.1176, "reward": 1.822265625, "reward_std": 0.45720867812633514, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.900390625, "step": 2314 }, { "clip_ratio": 0.0, "completion_length": 726.4140625, "epoch": 0.926, "grad_norm": 0.37993120247259693, "kl": 0.1231689453125, "learning_rate": 3.3176611395540625e-07, "loss": 0.1139, "reward": 2.00390625, "reward_std": 0.35766202211380005, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94140625, "step": 2315 }, { "clip_ratio": 0.0, "completion_length": 691.6328125, "epoch": 0.9264, "grad_norm": 0.2767457730942031, "kl": 0.13427734375, "learning_rate": 3.282088438436715e-07, "loss": 0.1014, "reward": 2.03125, "reward_std": 0.3521539568901062, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.9765625, "step": 2316 }, { "clip_ratio": 0.0, "completion_length": 604.90625, "epoch": 0.9268, "grad_norm": 0.43216372728902575, "kl": 0.129150390625, "learning_rate": 3.246704293837011e-07, "loss": 0.0142, "reward": 2.416015625, "reward_std": 0.2883000895380974, "rewards/accuracy_reward": 0.4296875, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 2317 }, { "clip_ratio": 0.0, "completion_length": 649.078125, "epoch": 0.9272, "grad_norm": 0.1700257697196945, "kl": 0.121337890625, "learning_rate": 3.211508774738137e-07, "loss": 0.0539, "reward": 2.080078125, "reward_std": 0.1796875, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.978515625, "step": 2318 }, { "clip_ratio": 0.0, "completion_length": 621.2109375, "epoch": 0.9276, "grad_norm": 0.6808715771417184, "kl": 0.1246337890625, "learning_rate": 3.1765019497555617e-07, "loss": 0.0577, "reward": 2.091796875, "reward_std": 0.3168194964528084, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.974609375, "step": 2319 }, { "clip_ratio": 0.0, "completion_length": 712.3984375, "epoch": 0.928, "grad_norm": 0.1787075944677207, "kl": 0.1209716796875, "learning_rate": 3.1416838871368925e-07, "loss": 0.0747, "reward": 2.021484375, "reward_std": 0.2966759651899338, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.951171875, "step": 2320 }, { "clip_ratio": 0.0, "completion_length": 752.0546875, "epoch": 0.9284, "grad_norm": 0.41155729615041514, "kl": 0.12109375, "learning_rate": 3.10705465476171e-07, "loss": 0.0051, "reward": 2.0859375, "reward_std": 0.16219250857830048, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 1.0, "rewards/tag_count_reward": 1.0, "step": 2321 }, { "clip_ratio": 0.0, "completion_length": 725.734375, "epoch": 0.9288, "grad_norm": 0.30228538610808087, "kl": 0.139404296875, "learning_rate": 3.072614320141487e-07, "loss": 0.0581, "reward": 1.97265625, "reward_std": 0.33222130686044693, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 2322 }, { "clip_ratio": 0.0, "completion_length": 664.0390625, "epoch": 0.9292, "grad_norm": 0.3232685824250571, "kl": 0.147216796875, "learning_rate": 3.0383629504194047e-07, "loss": 0.0532, "reward": 2.17578125, "reward_std": 0.20466843992471695, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2323 }, { "clip_ratio": 0.0, "completion_length": 708.796875, "epoch": 0.9296, "grad_norm": 0.15062555042008324, "kl": 0.115966796875, "learning_rate": 3.00430061237027e-07, "loss": 0.0656, "reward": 1.955078125, "reward_std": 0.22825969010591507, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2324 }, { "clip_ratio": 0.0, "completion_length": 633.65625, "epoch": 0.93, "grad_norm": 0.1512820013395415, "kl": 0.123046875, "learning_rate": 2.970427372400353e-07, "loss": 0.0266, "reward": 2.201171875, "reward_std": 0.1953125, "rewards/accuracy_reward": 0.2421875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2325 }, { "clip_ratio": 0.0, "completion_length": 739.875, "epoch": 0.9304, "grad_norm": 0.21406357266499615, "kl": 0.148193359375, "learning_rate": 2.936743296547273e-07, "loss": 0.0896, "reward": 2.015625, "reward_std": 0.36891815811395645, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.9609375, "step": 2326 }, { "clip_ratio": 0.0, "completion_length": 782.0, "epoch": 0.9308, "grad_norm": 0.20127906861817071, "kl": 0.1142578125, "learning_rate": 2.9032484504798454e-07, "loss": 0.0651, "reward": 2.01171875, "reward_std": 0.5157964676618576, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.94140625, "step": 2327 }, { "clip_ratio": 0.0, "completion_length": 804.625, "epoch": 0.9312, "grad_norm": 0.5845202399038341, "kl": 0.12109375, "learning_rate": 2.8699428994980017e-07, "loss": 0.0676, "reward": 1.90234375, "reward_std": 0.40592119097709656, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94921875, "step": 2328 }, { "clip_ratio": 0.0, "completion_length": 822.015625, "epoch": 0.9316, "grad_norm": 0.45760288928693865, "kl": 0.1436767578125, "learning_rate": 2.836826708532603e-07, "loss": 0.0369, "reward": 2.0234375, "reward_std": 0.24659234285354614, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2329 }, { "clip_ratio": 0.0, "completion_length": 648.3828125, "epoch": 0.932, "grad_norm": 0.23602703611963588, "kl": 0.1142578125, "learning_rate": 2.8038999421453827e-07, "loss": 0.0759, "reward": 2.146484375, "reward_std": 0.32353654503822327, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 2330 }, { "clip_ratio": 0.0, "completion_length": 783.4375, "epoch": 0.9324, "grad_norm": 0.2661117815707096, "kl": 0.12060546875, "learning_rate": 2.771162664528726e-07, "loss": 0.0318, "reward": 2.078125, "reward_std": 0.34414271265268326, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 2331 }, { "clip_ratio": 0.0, "completion_length": 736.890625, "epoch": 0.9328, "grad_norm": 0.1625451675197373, "kl": 0.1148681640625, "learning_rate": 2.7386149395056463e-07, "loss": 0.077, "reward": 2.013671875, "reward_std": 0.2874603271484375, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.951171875, "step": 2332 }, { "clip_ratio": 0.0, "completion_length": 732.1640625, "epoch": 0.9332, "grad_norm": 0.31889521912035235, "kl": 0.1285400390625, "learning_rate": 2.7062568305295967e-07, "loss": 0.0349, "reward": 2.177734375, "reward_std": 0.25289584696292877, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.990234375, "step": 2333 }, { "clip_ratio": 0.0, "completion_length": 724.1484375, "epoch": 0.9336, "grad_norm": 121.19698139296473, "kl": 0.5029296875, "learning_rate": 2.6740884006843826e-07, "loss": 0.057, "reward": 2.072265625, "reward_std": 0.3590550571680069, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2334 }, { "clip_ratio": 0.0, "completion_length": 688.609375, "epoch": 0.934, "grad_norm": 1.1001919499089858, "kl": 0.18603515625, "learning_rate": 2.6421097126839714e-07, "loss": 0.1124, "reward": 2.201171875, "reward_std": 0.39912249147892, "rewards/accuracy_reward": 0.3046875, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.958984375, "step": 2335 }, { "clip_ratio": 0.0, "completion_length": 711.90625, "epoch": 0.9344, "grad_norm": 0.6875579286701271, "kl": 0.130615234375, "learning_rate": 2.6103208288724815e-07, "loss": 0.0243, "reward": 2.14453125, "reward_std": 0.1812114119529724, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2336 }, { "clip_ratio": 0.0, "completion_length": 682.8515625, "epoch": 0.9348, "grad_norm": 0.36880539867272605, "kl": 0.139404296875, "learning_rate": 2.57872181122395e-07, "loss": 0.1011, "reward": 2.0703125, "reward_std": 0.4659320116043091, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.953125, "step": 2337 }, { "clip_ratio": 0.0, "completion_length": 732.1484375, "epoch": 0.9352, "grad_norm": 0.30671852768001245, "kl": 0.1114501953125, "learning_rate": 2.547312721342277e-07, "loss": 0.0566, "reward": 1.96484375, "reward_std": 0.2157595381140709, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 2338 }, { "clip_ratio": 0.0, "completion_length": 716.578125, "epoch": 0.9356, "grad_norm": 2.0454904983076094, "kl": 0.128173828125, "learning_rate": 2.516093620461124e-07, "loss": 0.0683, "reward": 1.93359375, "reward_std": 0.41749949008226395, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 2339 }, { "clip_ratio": 0.0, "completion_length": 634.515625, "epoch": 0.936, "grad_norm": 0.28305474393765445, "kl": 0.1219482421875, "learning_rate": 2.4850645694436736e-07, "loss": 0.0489, "reward": 2.103515625, "reward_std": 0.24565474689006805, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.978515625, "step": 2340 }, { "clip_ratio": 0.0, "completion_length": 689.109375, "epoch": 0.9364, "grad_norm": 0.29460558027772177, "kl": 0.126953125, "learning_rate": 2.4542256287826915e-07, "loss": 0.0354, "reward": 2.1328125, "reward_std": 0.2530967593193054, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2341 }, { "clip_ratio": 0.0, "completion_length": 648.2578125, "epoch": 0.9368, "grad_norm": 1.5925862418072023, "kl": 0.1353759765625, "learning_rate": 2.423576858600252e-07, "loss": 0.136, "reward": 2.140625, "reward_std": 0.34529343992471695, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2342 }, { "clip_ratio": 0.0, "completion_length": 762.453125, "epoch": 0.9372, "grad_norm": 0.33990790721054986, "kl": 0.11376953125, "learning_rate": 2.3931183186477026e-07, "loss": 0.0603, "reward": 2.091796875, "reward_std": 0.36688994616270065, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.958984375, "step": 2343 }, { "clip_ratio": 0.0, "completion_length": 687.3515625, "epoch": 0.9376, "grad_norm": 0.42001660810216496, "kl": 0.16162109375, "learning_rate": 2.3628500683055222e-07, "loss": 0.1088, "reward": 1.84375, "reward_std": 0.4341566264629364, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9296875, "step": 2344 }, { "clip_ratio": 0.0, "completion_length": 723.4609375, "epoch": 0.938, "grad_norm": 0.3030680946958436, "kl": 0.1202392578125, "learning_rate": 2.332772166583208e-07, "loss": 0.0498, "reward": 1.919921875, "reward_std": 0.24289856851100922, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 2345 }, { "clip_ratio": 0.0, "completion_length": 773.0703125, "epoch": 0.9384, "grad_norm": 1.0244496662495066, "kl": 0.144287109375, "learning_rate": 2.3028846721191878e-07, "loss": 0.0869, "reward": 1.71484375, "reward_std": 0.4417106807231903, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.8125, "rewards/tag_count_reward": 0.87109375, "step": 2346 }, { "clip_ratio": 0.0, "completion_length": 709.140625, "epoch": 0.9388, "grad_norm": 0.4294867418743285, "kl": 0.138671875, "learning_rate": 2.273187643180652e-07, "loss": 0.0868, "reward": 1.9609375, "reward_std": 0.3701956570148468, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.9609375, "step": 2347 }, { "clip_ratio": 0.0, "completion_length": 708.609375, "epoch": 0.9392, "grad_norm": 17.232665009031376, "kl": 4.781982421875, "learning_rate": 2.2436811376634893e-07, "loss": 0.2992, "reward": 2.05078125, "reward_std": 0.2623259201645851, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 2348 }, { "clip_ratio": 0.0, "completion_length": 766.8515625, "epoch": 0.9396, "grad_norm": 0.3008145699509498, "kl": 0.115966796875, "learning_rate": 2.214365213092118e-07, "loss": 0.0192, "reward": 1.98828125, "reward_std": 0.3526473492383957, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 2349 }, { "clip_ratio": 0.0, "completion_length": 683.6875, "epoch": 0.94, "grad_norm": 0.34768131725978446, "kl": 0.12109375, "learning_rate": 2.1852399266194312e-07, "loss": 0.1, "reward": 2.16796875, "reward_std": 0.48225925117731094, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94921875, "step": 2350 }, { "clip_ratio": 0.0, "completion_length": 726.203125, "epoch": 0.9404, "grad_norm": 0.3797985091975591, "kl": 0.138671875, "learning_rate": 2.1563053350266983e-07, "loss": 0.0391, "reward": 2.19921875, "reward_std": 0.16846735030412674, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 2351 }, { "clip_ratio": 0.0, "completion_length": 666.3203125, "epoch": 0.9408, "grad_norm": 0.30711450117697925, "kl": 0.1259765625, "learning_rate": 2.1275614947233624e-07, "loss": 0.0696, "reward": 2.029296875, "reward_std": 0.3555542379617691, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.966796875, "step": 2352 }, { "clip_ratio": 0.0, "completion_length": 673.375, "epoch": 0.9412, "grad_norm": 0.2650918768074016, "kl": 0.135009765625, "learning_rate": 2.0990084617470207e-07, "loss": 0.0632, "reward": 2.1171875, "reward_std": 0.33144959062337875, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2353 }, { "clip_ratio": 0.0, "completion_length": 653.4296875, "epoch": 0.9416, "grad_norm": 0.23119025109172903, "kl": 0.13720703125, "learning_rate": 2.0706462917632676e-07, "loss": 0.1466, "reward": 1.958984375, "reward_std": 0.3107916787266731, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 2354 }, { "clip_ratio": 0.0, "completion_length": 662.7265625, "epoch": 0.942, "grad_norm": 1.2769025436266754, "kl": 0.12890625, "learning_rate": 2.0424750400655947e-07, "loss": 0.0853, "reward": 2.123046875, "reward_std": 0.38176267594099045, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.951171875, "step": 2355 }, { "clip_ratio": 0.0, "completion_length": 623.484375, "epoch": 0.9424, "grad_norm": 0.22687359480616398, "kl": 0.1287841796875, "learning_rate": 2.014494761575314e-07, "loss": 0.0767, "reward": 2.013671875, "reward_std": 0.23114747554063797, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.974609375, "step": 2356 }, { "clip_ratio": 0.0, "completion_length": 734.953125, "epoch": 0.9428, "grad_norm": 1.0298216669310338, "kl": 0.13232421875, "learning_rate": 1.9867055108414023e-07, "loss": 0.05, "reward": 2.126953125, "reward_std": 0.3641912266612053, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2357 }, { "clip_ratio": 0.0, "completion_length": 671.2109375, "epoch": 0.9432, "grad_norm": 0.28206804566889404, "kl": 0.134765625, "learning_rate": 1.9591073420404338e-07, "loss": 0.0509, "reward": 2.0390625, "reward_std": 0.1752803698182106, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 2358 }, { "clip_ratio": 0.0, "completion_length": 695.6171875, "epoch": 0.9436, "grad_norm": 0.29558370360510366, "kl": 0.137939453125, "learning_rate": 1.9317003089764365e-07, "loss": 0.1444, "reward": 1.931640625, "reward_std": 0.5558075159788132, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.8515625, "rewards/tag_count_reward": 0.892578125, "step": 2359 }, { "clip_ratio": 0.0, "completion_length": 730.078125, "epoch": 0.944, "grad_norm": 0.24628561304634575, "kl": 0.126220703125, "learning_rate": 1.9044844650808468e-07, "loss": 0.0849, "reward": 2.052734375, "reward_std": 0.3728065490722656, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.966796875, "step": 2360 }, { "clip_ratio": 0.0, "completion_length": 755.96875, "epoch": 0.9444, "grad_norm": 1.3193351293480886, "kl": 0.1236572265625, "learning_rate": 1.877459863412323e-07, "loss": 0.095, "reward": 2.130859375, "reward_std": 0.4742976166307926, "rewards/accuracy_reward": 0.2734375, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.943359375, "step": 2361 }, { "clip_ratio": 0.0, "completion_length": 690.1484375, "epoch": 0.9448, "grad_norm": 0.22707736535485396, "kl": 0.114013671875, "learning_rate": 1.8506265566567095e-07, "loss": -0.0003, "reward": 2.09375, "reward_std": 0.1948273777961731, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.9921875, "step": 2362 }, { "clip_ratio": 0.0, "completion_length": 806.59375, "epoch": 0.9452, "grad_norm": 0.45414781147500427, "kl": 0.1201171875, "learning_rate": 1.8239845971269266e-07, "loss": 0.0467, "reward": 1.939453125, "reward_std": 0.3974483460187912, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.955078125, "step": 2363 }, { "clip_ratio": 0.0, "completion_length": 768.8046875, "epoch": 0.9456, "grad_norm": 0.49746866905546117, "kl": 0.141357421875, "learning_rate": 1.7975340367628269e-07, "loss": 0.0429, "reward": 1.89453125, "reward_std": 0.3026595115661621, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.93359375, "step": 2364 }, { "clip_ratio": 0.0, "completion_length": 677.0703125, "epoch": 0.946, "grad_norm": 0.4441498744084522, "kl": 0.125244140625, "learning_rate": 1.7712749271311392e-07, "loss": 0.0323, "reward": 2.134765625, "reward_std": 0.16945012658834457, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 2365 }, { "clip_ratio": 0.0, "completion_length": 556.609375, "epoch": 0.9464, "grad_norm": 0.2812391976933948, "kl": 0.126953125, "learning_rate": 1.7452073194253237e-07, "loss": 0.1081, "reward": 2.171875, "reward_std": 0.32392311096191406, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2366 }, { "clip_ratio": 0.0, "completion_length": 784.515625, "epoch": 0.9468, "grad_norm": 0.28593056813011725, "kl": 0.1148681640625, "learning_rate": 1.719331264465529e-07, "loss": 0.0658, "reward": 1.9921875, "reward_std": 0.5583561062812805, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.9453125, "step": 2367 }, { "clip_ratio": 0.0, "completion_length": 696.3515625, "epoch": 0.9472, "grad_norm": 0.34457185538050583, "kl": 0.12548828125, "learning_rate": 1.6936468126984573e-07, "loss": 0.0678, "reward": 1.998046875, "reward_std": 0.3064749464392662, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 2368 }, { "clip_ratio": 0.0, "completion_length": 695.71875, "epoch": 0.9476, "grad_norm": 0.2784817848243958, "kl": 0.1336669921875, "learning_rate": 1.668154014197243e-07, "loss": 0.0846, "reward": 2.244140625, "reward_std": 0.2746783122420311, "rewards/accuracy_reward": 0.3359375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 2369 }, { "clip_ratio": 0.0, "completion_length": 611.2578125, "epoch": 0.948, "grad_norm": 0.2859776146535088, "kl": 0.123046875, "learning_rate": 1.6428529186614195e-07, "loss": 0.0608, "reward": 2.15234375, "reward_std": 0.26972050219774246, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 2370 }, { "clip_ratio": 0.0, "completion_length": 708.90625, "epoch": 0.9484, "grad_norm": 0.2962293023930537, "kl": 0.115966796875, "learning_rate": 1.6177435754167413e-07, "loss": 0.0243, "reward": 1.96484375, "reward_std": 0.2535141110420227, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.98046875, "step": 2371 }, { "clip_ratio": 0.0, "completion_length": 647.375, "epoch": 0.9488, "grad_norm": 0.2632073714088719, "kl": 0.1181640625, "learning_rate": 1.5928260334151847e-07, "loss": 0.0828, "reward": 2.072265625, "reward_std": 0.3784128203988075, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2372 }, { "clip_ratio": 0.0, "completion_length": 740.25, "epoch": 0.9492, "grad_norm": 0.2778906255976223, "kl": 0.1337890625, "learning_rate": 1.5681003412347573e-07, "loss": 0.0571, "reward": 1.939453125, "reward_std": 0.21316029131412506, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2373 }, { "clip_ratio": 0.0, "completion_length": 641.5234375, "epoch": 0.9496, "grad_norm": 0.3455544981832908, "kl": 0.11669921875, "learning_rate": 1.543566547079467e-07, "loss": 0.0513, "reward": 2.064453125, "reward_std": 0.20257875323295593, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.994140625, "step": 2374 }, { "clip_ratio": 0.0, "completion_length": 767.109375, "epoch": 0.95, "grad_norm": 1.1584587635542993, "kl": 0.14892578125, "learning_rate": 1.519224698779198e-07, "loss": 0.045, "reward": 1.92578125, "reward_std": 0.2623259201645851, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 2375 }, { "clip_ratio": 0.0, "completion_length": 772.859375, "epoch": 0.9504, "grad_norm": 0.25606421509793126, "kl": 0.1343994140625, "learning_rate": 1.4950748437896235e-07, "loss": 0.0547, "reward": 2.080078125, "reward_std": 0.3991714119911194, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.962890625, "step": 2376 }, { "clip_ratio": 0.0, "completion_length": 680.6171875, "epoch": 0.9508, "grad_norm": 0.4340416833048096, "kl": 0.159423828125, "learning_rate": 1.4711170291921485e-07, "loss": 0.1152, "reward": 1.83203125, "reward_std": 0.37668970972299576, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.8828125, "rewards/tag_count_reward": 0.93359375, "step": 2377 }, { "clip_ratio": 0.0, "completion_length": 760.984375, "epoch": 0.9512, "grad_norm": 0.2898791500331513, "kl": 0.115234375, "learning_rate": 1.4473513016937223e-07, "loss": 0.0601, "reward": 1.892578125, "reward_std": 0.3251463398337364, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.955078125, "step": 2378 }, { "clip_ratio": 0.0, "completion_length": 715.3984375, "epoch": 0.9516, "grad_norm": 0.4250722486939082, "kl": 0.13134765625, "learning_rate": 1.4237777076268723e-07, "loss": 0.0297, "reward": 1.984375, "reward_std": 0.2771770879626274, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2379 }, { "clip_ratio": 0.0, "completion_length": 716.140625, "epoch": 0.952, "grad_norm": 0.2567877895472001, "kl": 0.1097412109375, "learning_rate": 1.400396292949513e-07, "loss": 0.0455, "reward": 2.005859375, "reward_std": 0.38580841571092606, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.958984375, "step": 2380 }, { "clip_ratio": 0.0, "completion_length": 777.421875, "epoch": 0.9524, "grad_norm": 0.3041123033255762, "kl": 0.132080078125, "learning_rate": 1.377207103244904e-07, "loss": 0.084, "reward": 1.884765625, "reward_std": 0.4015243798494339, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.955078125, "step": 2381 }, { "clip_ratio": 0.0, "completion_length": 723.125, "epoch": 0.9528, "grad_norm": 0.21447537494723276, "kl": 0.11962890625, "learning_rate": 1.3542101837215826e-07, "loss": 0.0748, "reward": 1.9609375, "reward_std": 0.2366529181599617, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2382 }, { "clip_ratio": 0.0, "completion_length": 713.5703125, "epoch": 0.9532, "grad_norm": 0.2322254927281692, "kl": 0.14013671875, "learning_rate": 1.3314055792131964e-07, "loss": 0.1358, "reward": 1.9296875, "reward_std": 0.5216133445501328, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.921875, "step": 2383 }, { "clip_ratio": 0.0, "completion_length": 770.125, "epoch": 0.9536, "grad_norm": 0.3806389923373318, "kl": 0.1285400390625, "learning_rate": 1.308793334178493e-07, "loss": 0.0665, "reward": 2.0859375, "reward_std": 0.3878570944070816, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.9609375, "step": 2384 }, { "clip_ratio": 0.0, "completion_length": 734.2734375, "epoch": 0.954, "grad_norm": 0.25919784529478246, "kl": 0.121826171875, "learning_rate": 1.2863734927012094e-07, "loss": 0.0863, "reward": 1.91796875, "reward_std": 0.46267981082201004, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.93359375, "step": 2385 }, { "clip_ratio": 0.0, "completion_length": 759.5625, "epoch": 0.9544, "grad_norm": 0.28520713267948056, "kl": 0.13525390625, "learning_rate": 1.26414609848996e-07, "loss": 0.0395, "reward": 2.060546875, "reward_std": 0.28589920699596405, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 2386 }, { "clip_ratio": 0.0, "completion_length": 763.390625, "epoch": 0.9548, "grad_norm": 0.23804746309322164, "kl": 0.115966796875, "learning_rate": 1.242111194878215e-07, "loss": 0.0372, "reward": 2.021484375, "reward_std": 0.24858064949512482, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 2387 }, { "clip_ratio": 0.0, "completion_length": 670.5703125, "epoch": 0.9552, "grad_norm": 0.36372246335694547, "kl": 0.1322021484375, "learning_rate": 1.2202688248241113e-07, "loss": 0.0841, "reward": 1.921875, "reward_std": 0.24914440512657166, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 2388 }, { "clip_ratio": 0.0, "completion_length": 716.984375, "epoch": 0.9556, "grad_norm": 0.5220560140692145, "kl": 0.12744140625, "learning_rate": 1.1986190309104861e-07, "loss": 0.0922, "reward": 2.2109375, "reward_std": 0.37655720859766006, "rewards/accuracy_reward": 0.3359375, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.953125, "step": 2389 }, { "clip_ratio": 0.0, "completion_length": 784.3515625, "epoch": 0.956, "grad_norm": 0.48412094418020424, "kl": 0.1231689453125, "learning_rate": 1.1771618553447217e-07, "loss": 0.0732, "reward": 1.88671875, "reward_std": 0.42548196017742157, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.92578125, "step": 2390 }, { "clip_ratio": 0.0, "completion_length": 698.125, "epoch": 0.9564, "grad_norm": 0.22840462799891462, "kl": 0.12255859375, "learning_rate": 1.1558973399586671e-07, "loss": 0.0714, "reward": 2.068359375, "reward_std": 0.3114965632557869, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 2391 }, { "clip_ratio": 0.0, "completion_length": 700.0234375, "epoch": 0.9568, "grad_norm": 0.3522710414515286, "kl": 0.1248779296875, "learning_rate": 1.134825526208605e-07, "loss": 0.0661, "reward": 2.134765625, "reward_std": 0.2932591512799263, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2392 }, { "clip_ratio": 0.0, "completion_length": 691.6171875, "epoch": 0.9572, "grad_norm": 0.2201018966815442, "kl": 0.137939453125, "learning_rate": 1.1139464551750857e-07, "loss": 0.083, "reward": 2.158203125, "reward_std": 0.2978721931576729, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 2393 }, { "clip_ratio": 0.0, "completion_length": 703.375, "epoch": 0.9576, "grad_norm": 0.21828278468035756, "kl": 0.123779296875, "learning_rate": 1.0932601675629595e-07, "loss": 0.0601, "reward": 2.16796875, "reward_std": 0.29910366982221603, "rewards/accuracy_reward": 0.2109375, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.98046875, "step": 2394 }, { "clip_ratio": 0.0, "completion_length": 677.84375, "epoch": 0.958, "grad_norm": 0.40769216156302723, "kl": 0.150634765625, "learning_rate": 1.0727667037011668e-07, "loss": 0.036, "reward": 1.966796875, "reward_std": 0.3482285812497139, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.951171875, "step": 2395 }, { "clip_ratio": 0.0, "completion_length": 673.1953125, "epoch": 0.9584, "grad_norm": 0.34819921017509203, "kl": 0.1435546875, "learning_rate": 1.052466103542793e-07, "loss": 0.0746, "reward": 2.001953125, "reward_std": 0.29063531011343, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 2396 }, { "clip_ratio": 0.0, "completion_length": 731.6796875, "epoch": 0.9588, "grad_norm": 0.44170516732244475, "kl": 0.11083984375, "learning_rate": 1.0323584066648795e-07, "loss": 0.0496, "reward": 2.10546875, "reward_std": 0.45074574649333954, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 2397 }, { "clip_ratio": 0.0, "completion_length": 778.484375, "epoch": 0.9592, "grad_norm": 0.3431600217234219, "kl": 0.123779296875, "learning_rate": 1.0124436522684244e-07, "loss": 0.0442, "reward": 2.078125, "reward_std": 0.44632577896118164, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9453125, "step": 2398 }, { "clip_ratio": 0.0, "completion_length": 701.734375, "epoch": 0.9596, "grad_norm": 0.6867994995654646, "kl": 0.1265869140625, "learning_rate": 9.9272187917826e-08, "loss": 0.0747, "reward": 2.126953125, "reward_std": 0.3132813274860382, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 2399 }, { "clip_ratio": 0.0, "completion_length": 751.46875, "epoch": 0.96, "grad_norm": 0.3268873229010073, "kl": 0.121337890625, "learning_rate": 9.731931258429638e-08, "loss": 0.0457, "reward": 1.921875, "reward_std": 0.33389707654714584, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.9609375, "step": 2400 }, { "clip_ratio": 0.0, "completion_length": 654.296875, "epoch": 0.9604, "grad_norm": 0.38228744108855356, "kl": 0.149658203125, "learning_rate": 9.538574303348813e-08, "loss": 0.0978, "reward": 1.94140625, "reward_std": 0.39466558396816254, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 2401 }, { "clip_ratio": 0.0, "completion_length": 705.3671875, "epoch": 0.9608, "grad_norm": 0.27338835538607237, "kl": 0.130126953125, "learning_rate": 9.347148303499143e-08, "loss": 0.0363, "reward": 1.998046875, "reward_std": 0.25248485058546066, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.982421875, "step": 2402 }, { "clip_ratio": 0.0, "completion_length": 745.46875, "epoch": 0.9612, "grad_norm": 0.16937639866953672, "kl": 0.1202392578125, "learning_rate": 9.157653632075435e-08, "loss": 0.0646, "reward": 1.908203125, "reward_std": 0.3194200322031975, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.955078125, "step": 2403 }, { "clip_ratio": 0.0, "completion_length": 738.3359375, "epoch": 0.9616, "grad_norm": 0.3979582766380055, "kl": 0.14404296875, "learning_rate": 8.970090658507291e-08, "loss": 0.0798, "reward": 1.908203125, "reward_std": 0.38880301266908646, "rewards/accuracy_reward": 0.0234375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.955078125, "step": 2404 }, { "clip_ratio": 0.0, "completion_length": 751.375, "epoch": 0.962, "grad_norm": 0.24769952846719498, "kl": 0.1173095703125, "learning_rate": 8.784459748458318e-08, "loss": 0.0749, "reward": 1.91796875, "reward_std": 0.34345244616270065, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.95703125, "step": 2405 }, { "clip_ratio": 0.0, "completion_length": 700.2578125, "epoch": 0.9624, "grad_norm": 0.3518630299011668, "kl": 0.12353515625, "learning_rate": 8.600761263825475e-08, "loss": 0.0943, "reward": 1.986328125, "reward_std": 0.4367850199341774, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.955078125, "step": 2406 }, { "clip_ratio": 0.0, "completion_length": 686.7109375, "epoch": 0.9628, "grad_norm": 0.5118150317573789, "kl": 0.1219482421875, "learning_rate": 8.418995562738286e-08, "loss": 0.093, "reward": 2.087890625, "reward_std": 0.4219738394021988, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.955078125, "step": 2407 }, { "clip_ratio": 0.0, "completion_length": 698.2109375, "epoch": 0.9632, "grad_norm": 0.17941976581961055, "kl": 0.1326904296875, "learning_rate": 8.239162999558403e-08, "loss": 0.0287, "reward": 2.23828125, "reward_std": 0.15207062661647797, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2408 }, { "clip_ratio": 0.0, "completion_length": 748.3671875, "epoch": 0.9636, "grad_norm": 0.2315714105829642, "kl": 0.12548828125, "learning_rate": 8.061263924878604e-08, "loss": 0.0364, "reward": 1.943359375, "reward_std": 0.30047930032014847, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 2409 }, { "clip_ratio": 0.0, "completion_length": 732.234375, "epoch": 0.964, "grad_norm": 0.4147002157125339, "kl": 0.136474609375, "learning_rate": 7.885298685522235e-08, "loss": 0.0622, "reward": 2.1796875, "reward_std": 0.3326834365725517, "rewards/accuracy_reward": 0.2734375, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.96875, "step": 2410 }, { "clip_ratio": 0.0, "completion_length": 768.40625, "epoch": 0.9644, "grad_norm": 0.47445397855419735, "kl": 0.188720703125, "learning_rate": 7.71126762454233e-08, "loss": 0.0562, "reward": 1.923828125, "reward_std": 0.3359375, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.947265625, "step": 2411 }, { "clip_ratio": 0.0, "completion_length": 748.9140625, "epoch": 0.9648, "grad_norm": 0.28379200909831953, "kl": 0.13037109375, "learning_rate": 7.539171081221597e-08, "loss": 0.0562, "reward": 2.142578125, "reward_std": 0.41336067020893097, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 2412 }, { "clip_ratio": 0.0, "completion_length": 755.6796875, "epoch": 0.9652, "grad_norm": 0.2555630831369971, "kl": 0.131591796875, "learning_rate": 7.369009391070992e-08, "loss": 0.0723, "reward": 2.048828125, "reward_std": 0.364450179040432, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.947265625, "step": 2413 }, { "clip_ratio": 0.0, "completion_length": 701.28125, "epoch": 0.9656, "grad_norm": 0.2857304630575103, "kl": 0.136962890625, "learning_rate": 7.200782885829482e-08, "loss": 0.0441, "reward": 2.1328125, "reward_std": 0.28997454792261124, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2414 }, { "clip_ratio": 0.0, "completion_length": 675.3984375, "epoch": 0.966, "grad_norm": 0.2095024661405846, "kl": 0.119384765625, "learning_rate": 7.034491893463059e-08, "loss": 0.0873, "reward": 2.0703125, "reward_std": 0.2892879694700241, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.96875, "step": 2415 }, { "clip_ratio": 0.0, "completion_length": 683.53125, "epoch": 0.9664, "grad_norm": 0.15407018991405527, "kl": 0.1182861328125, "learning_rate": 6.870136738164612e-08, "loss": 0.1096, "reward": 2.013671875, "reward_std": 0.3938276022672653, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.943359375, "step": 2416 }, { "clip_ratio": 0.0, "completion_length": 670.796875, "epoch": 0.9668, "grad_norm": 0.7066138067001865, "kl": 0.1241455078125, "learning_rate": 6.707717740353059e-08, "loss": 0.0354, "reward": 2.08984375, "reward_std": 0.34828995913267136, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 2417 }, { "clip_ratio": 0.0, "completion_length": 760.8828125, "epoch": 0.9672, "grad_norm": 0.3094820189186323, "kl": 0.1334228515625, "learning_rate": 6.547235216672443e-08, "loss": 0.0257, "reward": 2.103515625, "reward_std": 0.30458373576402664, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.978515625, "step": 2418 }, { "clip_ratio": 0.0, "completion_length": 786.1796875, "epoch": 0.9676, "grad_norm": 0.23684660669641575, "kl": 0.1214599609375, "learning_rate": 6.388689479991606e-08, "loss": 0.0901, "reward": 2.0, "reward_std": 0.51716548204422, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.9296875, "step": 2419 }, { "clip_ratio": 0.0, "completion_length": 780.703125, "epoch": 0.968, "grad_norm": 0.9148268455999625, "kl": 0.143798828125, "learning_rate": 6.232080839403631e-08, "loss": 0.0648, "reward": 2.125, "reward_std": 0.46819616109132767, "rewards/accuracy_reward": 0.2734375, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.9375, "step": 2420 }, { "clip_ratio": 0.0, "completion_length": 728.859375, "epoch": 0.9684, "grad_norm": 0.27292449936596713, "kl": 0.126953125, "learning_rate": 6.07740960022507e-08, "loss": 0.0952, "reward": 1.947265625, "reward_std": 0.3367913216352463, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.955078125, "step": 2421 }, { "clip_ratio": 0.0, "completion_length": 788.96875, "epoch": 0.9688, "grad_norm": 0.22986144131638062, "kl": 0.11572265625, "learning_rate": 5.9246760639953824e-08, "loss": 0.0636, "reward": 2.04296875, "reward_std": 0.4174332395195961, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.96484375, "step": 2422 }, { "clip_ratio": 0.0, "completion_length": 764.28125, "epoch": 0.9692, "grad_norm": 0.3100598257747221, "kl": 0.14892578125, "learning_rate": 5.7738805284764945e-08, "loss": 0.0702, "reward": 1.814453125, "reward_std": 0.3104153797030449, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.908203125, "step": 2423 }, { "clip_ratio": 0.0, "completion_length": 638.125, "epoch": 0.9696, "grad_norm": 0.4401512025777835, "kl": 0.150634765625, "learning_rate": 5.625023287652021e-08, "loss": 0.1148, "reward": 2.0078125, "reward_std": 0.308236762881279, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 2424 }, { "clip_ratio": 0.0, "completion_length": 765.4140625, "epoch": 0.97, "grad_norm": 0.2904909800115586, "kl": 0.1370849609375, "learning_rate": 5.4781046317267103e-08, "loss": 0.0693, "reward": 1.982421875, "reward_std": 0.3735928535461426, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 2425 }, { "clip_ratio": 0.0, "completion_length": 672.4140625, "epoch": 0.9704, "grad_norm": 0.3371779457641124, "kl": 0.13916015625, "learning_rate": 5.3331248471258926e-08, "loss": 0.0664, "reward": 1.98828125, "reward_std": 0.2615520879626274, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 2426 }, { "clip_ratio": 0.0, "completion_length": 806.9453125, "epoch": 0.9708, "grad_norm": 0.23645253215365408, "kl": 0.1185302734375, "learning_rate": 5.190084216495361e-08, "loss": 0.0238, "reward": 1.919921875, "reward_std": 0.3307662308216095, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.958984375, "step": 2427 }, { "clip_ratio": 0.0, "completion_length": 697.4921875, "epoch": 0.9712, "grad_norm": 0.23893457918708969, "kl": 0.1175537109375, "learning_rate": 5.048983018699827e-08, "loss": 0.0224, "reward": 2.169921875, "reward_std": 0.10310593992471695, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.9921875, "rewards/tag_count_reward": 0.998046875, "step": 2428 }, { "clip_ratio": 0.0, "completion_length": 761.40625, "epoch": 0.9716, "grad_norm": 0.2803581677187435, "kl": 0.13134765625, "learning_rate": 4.9098215288235776e-08, "loss": 0.0335, "reward": 2.0078125, "reward_std": 0.35546497255563736, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 2429 }, { "clip_ratio": 0.0, "completion_length": 719.671875, "epoch": 0.972, "grad_norm": 0.09871340835905443, "kl": 0.124755859375, "learning_rate": 4.772600018168816e-08, "loss": 0.0355, "reward": 2.138671875, "reward_std": 0.1679232344031334, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2430 }, { "clip_ratio": 0.0, "completion_length": 724.15625, "epoch": 0.9724, "grad_norm": 0.8430576991646751, "kl": 0.1224365234375, "learning_rate": 4.6373187542561036e-08, "loss": 0.0553, "reward": 2.146484375, "reward_std": 0.3156280145049095, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.974609375, "step": 2431 }, { "clip_ratio": 0.0, "completion_length": 777.0234375, "epoch": 0.9728, "grad_norm": 0.34283020949569454, "kl": 0.1427001953125, "learning_rate": 4.503978000823028e-08, "loss": 0.0848, "reward": 1.90234375, "reward_std": 0.46316882967948914, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9140625, "rewards/tag_count_reward": 0.94921875, "step": 2432 }, { "clip_ratio": 0.0, "completion_length": 713.1328125, "epoch": 0.9732, "grad_norm": 0.1871324543673278, "kl": 0.138671875, "learning_rate": 4.3725780178243135e-08, "loss": 0.0591, "reward": 1.876953125, "reward_std": 0.23877985030412674, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.947265625, "step": 2433 }, { "clip_ratio": 0.0, "completion_length": 715.1640625, "epoch": 0.9736, "grad_norm": 0.22536146543856156, "kl": 0.124267578125, "learning_rate": 4.2431190614309334e-08, "loss": 0.1438, "reward": 1.767578125, "reward_std": 0.41630594432353973, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.8671875, "rewards/tag_count_reward": 0.900390625, "step": 2434 }, { "clip_ratio": 0.0, "completion_length": 719.3125, "epoch": 0.974, "grad_norm": 0.624437976068448, "kl": 0.15673828125, "learning_rate": 4.115601384029666e-08, "loss": 0.0614, "reward": 1.97265625, "reward_std": 0.33732428401708603, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.93359375, "step": 2435 }, { "clip_ratio": 0.0, "completion_length": 732.2109375, "epoch": 0.9744, "grad_norm": 3.8224733370948836, "kl": 0.1298828125, "learning_rate": 3.990025234222872e-08, "loss": 0.0831, "reward": 1.94921875, "reward_std": 0.3557347357273102, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.92578125, "step": 2436 }, { "clip_ratio": 0.0, "completion_length": 779.109375, "epoch": 0.9748, "grad_norm": 0.35509144783972035, "kl": 0.1138916015625, "learning_rate": 3.866390856827495e-08, "loss": 0.0552, "reward": 2.072265625, "reward_std": 0.3783107027411461, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 2437 }, { "clip_ratio": 0.0, "completion_length": 817.9296875, "epoch": 0.9752, "grad_norm": 0.27866122296146345, "kl": 0.1285400390625, "learning_rate": 3.7446984928753984e-08, "loss": 0.0298, "reward": 2.015625, "reward_std": 0.34034234285354614, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.953125, "step": 2438 }, { "clip_ratio": 0.0, "completion_length": 651.9609375, "epoch": 0.9756, "grad_norm": 0.33350755551363287, "kl": 0.140625, "learning_rate": 3.6249483796116924e-08, "loss": 0.0403, "reward": 2.08984375, "reward_std": 0.18637026846408844, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 2439 }, { "clip_ratio": 0.0, "completion_length": 741.1640625, "epoch": 0.976, "grad_norm": 0.22317577321376506, "kl": 0.12109375, "learning_rate": 3.50714075049563e-08, "loss": 0.0477, "reward": 1.92578125, "reward_std": 0.24883579462766647, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 2440 }, { "clip_ratio": 0.0, "completion_length": 663.109375, "epoch": 0.9764, "grad_norm": 0.23081712795761988, "kl": 0.1265869140625, "learning_rate": 3.391275835199159e-08, "loss": 0.0708, "reward": 2.046875, "reward_std": 0.22789177298545837, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2441 }, { "clip_ratio": 0.0, "completion_length": 658.6640625, "epoch": 0.9768, "grad_norm": 1.6641380050311125, "kl": 0.1331787109375, "learning_rate": 3.2773538596068134e-08, "loss": 0.0938, "reward": 2.185546875, "reward_std": 0.3802133947610855, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 2442 }, { "clip_ratio": 0.0, "completion_length": 756.515625, "epoch": 0.9772, "grad_norm": 0.45312742772202436, "kl": 0.1337890625, "learning_rate": 3.165375045815266e-08, "loss": 0.111, "reward": 1.966796875, "reward_std": 0.35926565527915955, "rewards/accuracy_reward": 0.203125, "rewards/format_reward": 0.8515625, "rewards/tag_count_reward": 0.912109375, "step": 2443 }, { "clip_ratio": 0.0, "completion_length": 737.546875, "epoch": 0.9776, "grad_norm": 0.28577135699765893, "kl": 0.1025390625, "learning_rate": 3.0553396121330015e-08, "loss": 0.0643, "reward": 1.994140625, "reward_std": 0.31518368422985077, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.970703125, "step": 2444 }, { "clip_ratio": 0.0, "completion_length": 688.6875, "epoch": 0.978, "grad_norm": 0.3844302921893115, "kl": 0.140625, "learning_rate": 2.947247773079753e-08, "loss": 0.0764, "reward": 1.94921875, "reward_std": 0.34885483235120773, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 2445 }, { "clip_ratio": 0.0, "completion_length": 645.265625, "epoch": 0.9784, "grad_norm": 1.0796165867221117, "kl": 0.1591796875, "learning_rate": 2.8410997393860663e-08, "loss": 0.0586, "reward": 2.24609375, "reward_std": 0.2088528871536255, "rewards/accuracy_reward": 0.2890625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98828125, "step": 2446 }, { "clip_ratio": 0.0, "completion_length": 697.609375, "epoch": 0.9788, "grad_norm": 0.3693832536627311, "kl": 0.1148681640625, "learning_rate": 2.7368957179929602e-08, "loss": 0.0916, "reward": 2.1796875, "reward_std": 0.26786844432353973, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 2447 }, { "clip_ratio": 0.0, "completion_length": 687.8359375, "epoch": 0.9792, "grad_norm": 0.22428900256242745, "kl": 0.130126953125, "learning_rate": 2.6346359120514863e-08, "loss": 0.0637, "reward": 2.060546875, "reward_std": 0.2319396734237671, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2448 }, { "clip_ratio": 0.0, "completion_length": 735.8359375, "epoch": 0.9796, "grad_norm": 0.29006712415869484, "kl": 0.1142578125, "learning_rate": 2.5343205209225062e-08, "loss": 0.0525, "reward": 1.986328125, "reward_std": 0.46049046516418457, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.947265625, "step": 2449 }, { "clip_ratio": 0.0, "completion_length": 808.0546875, "epoch": 0.98, "grad_norm": 0.15381901904220513, "kl": 0.1146240234375, "learning_rate": 2.4359497401758026e-08, "loss": 0.0293, "reward": 1.96484375, "reward_std": 0.19205471873283386, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.98046875, "step": 2450 }, { "clip_ratio": 0.0, "completion_length": 708.125, "epoch": 0.9804, "grad_norm": 0.8881325632163971, "kl": 0.1448974609375, "learning_rate": 2.339523761590301e-08, "loss": 0.0964, "reward": 2.05078125, "reward_std": 0.4749554395675659, "rewards/accuracy_reward": 0.2890625, "rewards/format_reward": 0.859375, "rewards/tag_count_reward": 0.90234375, "step": 2451 }, { "clip_ratio": 0.0, "completion_length": 757.78125, "epoch": 0.9808, "grad_norm": 0.3140229668899423, "kl": 0.12744140625, "learning_rate": 2.2450427731534052e-08, "loss": 0.0712, "reward": 2.06640625, "reward_std": 0.3821433112025261, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 2452 }, { "clip_ratio": 0.0, "completion_length": 646.9921875, "epoch": 0.9812, "grad_norm": 0.4256613480736053, "kl": 0.1572265625, "learning_rate": 2.152506959060774e-08, "loss": 0.1087, "reward": 1.994140625, "reward_std": 0.4033687263727188, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.947265625, "step": 2453 }, { "clip_ratio": 0.0, "completion_length": 732.578125, "epoch": 0.9816, "grad_norm": 0.2506841977339269, "kl": 0.1302490234375, "learning_rate": 2.061916499715544e-08, "loss": 0.0481, "reward": 1.931640625, "reward_std": 0.2808302640914917, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.962890625, "step": 2454 }, { "clip_ratio": 0.0, "completion_length": 791.8203125, "epoch": 0.982, "grad_norm": 0.20255695145763553, "kl": 0.121337890625, "learning_rate": 1.973271571728441e-08, "loss": 0.0547, "reward": 1.966796875, "reward_std": 0.34815485030412674, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.966796875, "step": 2455 }, { "clip_ratio": 0.0, "completion_length": 679.5546875, "epoch": 0.9824, "grad_norm": 0.6440406361369407, "kl": 0.15771484375, "learning_rate": 1.886572347917337e-08, "loss": 0.0867, "reward": 1.9296875, "reward_std": 0.43484821915626526, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.953125, "step": 2456 }, { "clip_ratio": 0.0, "completion_length": 813.5234375, "epoch": 0.9828, "grad_norm": 0.35289164197270956, "kl": 0.1317138671875, "learning_rate": 1.8018189973069144e-08, "loss": 0.0828, "reward": 1.91015625, "reward_std": 0.5252429693937302, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.90625, "rewards/tag_count_reward": 0.94140625, "step": 2457 }, { "clip_ratio": 0.0, "completion_length": 597.21875, "epoch": 0.9832, "grad_norm": 0.3868387476394967, "kl": 0.1258544921875, "learning_rate": 1.7190116851280024e-08, "loss": 0.0886, "reward": 1.98046875, "reward_std": 0.31299371272325516, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 2458 }, { "clip_ratio": 0.0, "completion_length": 719.9609375, "epoch": 0.9836, "grad_norm": 0.9250542932996471, "kl": 0.1363525390625, "learning_rate": 1.6381505728176872e-08, "loss": 0.1466, "reward": 1.966796875, "reward_std": 0.37059489637613297, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.8984375, "rewards/tag_count_reward": 0.935546875, "step": 2459 }, { "clip_ratio": 0.0, "completion_length": 735.46875, "epoch": 0.984, "grad_norm": 1.4952504420521908, "kl": 0.139404296875, "learning_rate": 1.5592358180189782e-08, "loss": 0.0719, "reward": 1.998046875, "reward_std": 0.4043347090482712, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.951171875, "step": 2460 }, { "clip_ratio": 0.0, "completion_length": 754.9765625, "epoch": 0.9844, "grad_norm": 0.15728204880544502, "kl": 0.119384765625, "learning_rate": 1.482267574580143e-08, "loss": 0.0255, "reward": 2.072265625, "reward_std": 0.15673638880252838, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.986328125, "step": 2461 }, { "clip_ratio": 0.0, "completion_length": 730.9296875, "epoch": 0.9848, "grad_norm": 0.316213684115395, "kl": 0.14111328125, "learning_rate": 1.4072459925548176e-08, "loss": 0.0616, "reward": 1.912109375, "reward_std": 0.32253528386354446, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 2462 }, { "clip_ratio": 0.0, "completion_length": 742.3671875, "epoch": 0.9852, "grad_norm": 0.517835617180784, "kl": 0.1278076171875, "learning_rate": 1.3341712182012301e-08, "loss": 0.0486, "reward": 2.001953125, "reward_std": 0.3685624524950981, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2463 }, { "clip_ratio": 0.0, "completion_length": 762.5078125, "epoch": 0.9856, "grad_norm": 2.0967892851149172, "kl": 0.128173828125, "learning_rate": 1.2630433939825326e-08, "loss": 0.0654, "reward": 2.087890625, "reward_std": 0.38015166670084, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 2464 }, { "clip_ratio": 0.0, "completion_length": 733.078125, "epoch": 0.986, "grad_norm": 0.2159813218704811, "kl": 0.12841796875, "learning_rate": 1.1938626585660252e-08, "loss": 0.0791, "reward": 2.0234375, "reward_std": 0.39379390329122543, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.953125, "step": 2465 }, { "clip_ratio": 0.0, "completion_length": 795.59375, "epoch": 0.9864, "grad_norm": 0.3244598725021268, "kl": 0.1219482421875, "learning_rate": 1.126629146822933e-08, "loss": 0.0938, "reward": 1.90234375, "reward_std": 0.5483555793762207, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.890625, "rewards/tag_count_reward": 0.93359375, "step": 2466 }, { "clip_ratio": 0.0, "completion_length": 640.6171875, "epoch": 0.9868, "grad_norm": 0.17090261311588026, "kl": 0.128173828125, "learning_rate": 1.0613429898287397e-08, "loss": 0.0307, "reward": 2.130859375, "reward_std": 0.16524656862020493, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2467 }, { "clip_ratio": 0.0, "completion_length": 594.421875, "epoch": 0.9872, "grad_norm": 0.257584556246939, "kl": 0.123046875, "learning_rate": 9.980043148619668e-09, "loss": 0.0683, "reward": 2.216796875, "reward_std": 0.31908590346574783, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2468 }, { "clip_ratio": 0.0, "completion_length": 732.7890625, "epoch": 0.9876, "grad_norm": 0.48473246114640467, "kl": 0.130126953125, "learning_rate": 9.366132454046162e-09, "loss": 0.0913, "reward": 1.953125, "reward_std": 0.4132962301373482, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.875, "rewards/tag_count_reward": 0.9140625, "step": 2469 }, { "clip_ratio": 0.0, "completion_length": 685.0546875, "epoch": 0.988, "grad_norm": 0.44866268374411294, "kl": 0.1259765625, "learning_rate": 8.771699011416169e-09, "loss": 0.0991, "reward": 2.060546875, "reward_std": 0.5212081745266914, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.958984375, "step": 2470 }, { "clip_ratio": 0.0, "completion_length": 694.3125, "epoch": 0.9884, "grad_norm": 0.2950526284212562, "kl": 0.1334228515625, "learning_rate": 8.196743979610455e-09, "loss": 0.0194, "reward": 2.296875, "reward_std": 0.190364770591259, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 2471 }, { "clip_ratio": 0.0, "completion_length": 744.4765625, "epoch": 0.9888, "grad_norm": 0.3752219117208888, "kl": 0.132080078125, "learning_rate": 7.641268479531283e-09, "loss": 0.0751, "reward": 1.96484375, "reward_std": 0.47012338042259216, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.921875, "rewards/tag_count_reward": 0.94921875, "step": 2472 }, { "clip_ratio": 0.0, "completion_length": 707.28125, "epoch": 0.9892, "grad_norm": 0.577847821802471, "kl": 0.1273193359375, "learning_rate": 7.105273594107953e-09, "loss": 0.0474, "reward": 1.92578125, "reward_std": 0.21728545427322388, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 2473 }, { "clip_ratio": 0.0, "completion_length": 707.2578125, "epoch": 0.9896, "grad_norm": 0.29225570585586164, "kl": 0.1326904296875, "learning_rate": 6.588760368287928e-09, "loss": 0.1104, "reward": 2.015625, "reward_std": 0.3681846931576729, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.953125, "step": 2474 }, { "clip_ratio": 0.0, "completion_length": 733.0703125, "epoch": 0.99, "grad_norm": 0.5821311917221883, "kl": 0.11669921875, "learning_rate": 6.091729809042379e-09, "loss": 0.0571, "reward": 1.96484375, "reward_std": 0.29623471200466156, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.97265625, "step": 2475 }, { "clip_ratio": 0.0, "completion_length": 771.9375, "epoch": 0.9904, "grad_norm": 0.29435644759220775, "kl": 0.116455078125, "learning_rate": 5.614182885357311e-09, "loss": 0.058, "reward": 2.05859375, "reward_std": 0.39657649397850037, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.95703125, "step": 2476 }, { "clip_ratio": 0.0, "completion_length": 728.6171875, "epoch": 0.9908, "grad_norm": 0.6775623254232642, "kl": 0.146484375, "learning_rate": 5.156120528233555e-09, "loss": 0.0549, "reward": 1.974609375, "reward_std": 0.29594768583774567, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 2477 }, { "clip_ratio": 0.0, "completion_length": 654.34375, "epoch": 0.9912, "grad_norm": 0.6386954025906881, "kl": 0.135009765625, "learning_rate": 4.717543630688992e-09, "loss": 0.0338, "reward": 1.953125, "reward_std": 0.21534235030412674, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2478 }, { "clip_ratio": 0.0, "completion_length": 658.5625, "epoch": 0.9916, "grad_norm": 0.26414920964778976, "kl": 0.12109375, "learning_rate": 4.298453047749674e-09, "loss": 0.0652, "reward": 2.083984375, "reward_std": 0.2770504206418991, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.974609375, "step": 2479 }, { "clip_ratio": 0.0, "completion_length": 665.1796875, "epoch": 0.992, "grad_norm": 0.2555144996266575, "kl": 0.1368408203125, "learning_rate": 3.898849596456477e-09, "loss": 0.0955, "reward": 1.994140625, "reward_std": 0.3716190308332443, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.962890625, "step": 2480 }, { "clip_ratio": 0.0, "completion_length": 637.359375, "epoch": 0.9924, "grad_norm": 0.28690486628671974, "kl": 0.1229248046875, "learning_rate": 3.518734055855122e-09, "loss": 0.0772, "reward": 1.9921875, "reward_std": 0.32191721349954605, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2481 }, { "clip_ratio": 0.0, "completion_length": 753.5546875, "epoch": 0.9928, "grad_norm": 0.5718129393010967, "kl": 0.1295166015625, "learning_rate": 3.1581071670006013e-09, "loss": 0.0843, "reward": 2.009765625, "reward_std": 0.45324908196926117, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.947265625, "step": 2482 }, { "clip_ratio": 0.0, "completion_length": 581.2265625, "epoch": 0.9932, "grad_norm": 0.7060649291924707, "kl": 0.140380859375, "learning_rate": 2.8169696329527484e-09, "loss": 0.1111, "reward": 2.056640625, "reward_std": 0.3423817902803421, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.970703125, "step": 2483 }, { "clip_ratio": 0.0, "completion_length": 714.921875, "epoch": 0.9936, "grad_norm": 0.6024794763578734, "kl": 0.12451171875, "learning_rate": 2.495322118778454e-09, "loss": 0.0357, "reward": 2.09765625, "reward_std": 0.109375, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.98828125, "step": 2484 }, { "clip_ratio": 0.0, "completion_length": 701.1640625, "epoch": 0.994, "grad_norm": 0.18506185951300105, "kl": 0.1141357421875, "learning_rate": 2.193165251545004e-09, "loss": 0.0336, "reward": 1.958984375, "reward_std": 0.23232798278331757, "rewards/accuracy_reward": 0.015625, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.974609375, "step": 2485 }, { "clip_ratio": 0.0, "completion_length": 701.7578125, "epoch": 0.9944, "grad_norm": 0.22008278914681734, "kl": 0.131103515625, "learning_rate": 1.910499620322304e-09, "loss": 0.0524, "reward": 1.916015625, "reward_std": 0.23966552317142487, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9453125, "rewards/tag_count_reward": 0.970703125, "step": 2486 }, { "clip_ratio": 0.0, "completion_length": 660.25, "epoch": 0.9948, "grad_norm": 0.2709874890099565, "kl": 0.1356201171875, "learning_rate": 1.647325776182873e-09, "loss": 0.1106, "reward": 2.078125, "reward_std": 0.27704577147960663, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 2487 }, { "clip_ratio": 0.0, "completion_length": 593.7734375, "epoch": 0.9952, "grad_norm": 0.6437886278729968, "kl": 0.1719970703125, "learning_rate": 1.4036442321962995e-09, "loss": 0.0421, "reward": 2.0859375, "reward_std": 0.18195747584104538, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.984375, "step": 2488 }, { "clip_ratio": 0.0, "completion_length": 619.3515625, "epoch": 0.9956, "grad_norm": 0.30897566377652164, "kl": 0.133056640625, "learning_rate": 1.1794554634314558e-09, "loss": 0.065, "reward": 2.0625, "reward_std": 0.2771770879626274, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.96875, "rewards/tag_count_reward": 0.9765625, "step": 2489 }, { "clip_ratio": 0.0, "completion_length": 741.9921875, "epoch": 0.996, "grad_norm": 0.12845238791900107, "kl": 0.115234375, "learning_rate": 9.74759906957612e-10, "loss": 0.0211, "reward": 1.958984375, "reward_std": 0.12940485030412674, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.9765625, "rewards/tag_count_reward": 0.982421875, "step": 2490 }, { "clip_ratio": 0.0, "completion_length": 746.3515625, "epoch": 0.9964, "grad_norm": 0.17384014623034363, "kl": 0.1177978515625, "learning_rate": 7.895579618388827e-10, "loss": 0.0455, "reward": 1.92578125, "reward_std": 0.26784779131412506, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 2491 }, { "clip_ratio": 0.0, "completion_length": 740.0390625, "epoch": 0.9968, "grad_norm": 0.3432673058686426, "kl": 0.1317138671875, "learning_rate": 6.238499891353389e-10, "loss": 0.063, "reward": 2.0078125, "reward_std": 0.3517526537179947, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.9765625, "step": 2492 }, { "clip_ratio": 0.0, "completion_length": 709.1796875, "epoch": 0.9972, "grad_norm": 0.8275580400108304, "kl": 0.13232421875, "learning_rate": 4.77636311903007e-10, "loss": 0.1396, "reward": 1.935546875, "reward_std": 0.4293004497885704, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.9296875, "rewards/tag_count_reward": 0.951171875, "step": 2493 }, { "clip_ratio": 0.0, "completion_length": 696.90625, "epoch": 0.9976, "grad_norm": 0.5555126687669251, "kl": 0.123291015625, "learning_rate": 3.509172151938689e-10, "loss": 0.0823, "reward": 2.31640625, "reward_std": 0.40190740674734116, "rewards/accuracy_reward": 0.3984375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96484375, "step": 2494 }, { "clip_ratio": 0.0, "completion_length": 704.6953125, "epoch": 0.998, "grad_norm": 0.4966872323313065, "kl": 0.1314697265625, "learning_rate": 2.436929460525317e-10, "loss": 0.0921, "reward": 2.029296875, "reward_std": 0.3436766341328621, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.951171875, "step": 2495 }, { "clip_ratio": 0.0, "completion_length": 686.4609375, "epoch": 0.9984, "grad_norm": 0.27110605999736315, "kl": 0.12353515625, "learning_rate": 1.559637135173375e-10, "loss": 0.0739, "reward": 2.1484375, "reward_std": 0.42450354248285294, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.9609375, "rewards/tag_count_reward": 0.96875, "step": 2496 }, { "clip_ratio": 0.0, "completion_length": 739.828125, "epoch": 0.9988, "grad_norm": 0.3530294236070716, "kl": 0.148193359375, "learning_rate": 8.772968862369447e-11, "loss": 0.0608, "reward": 2.0703125, "reward_std": 0.28700755536556244, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.96875, "step": 2497 }, { "clip_ratio": 0.0, "completion_length": 743.5546875, "epoch": 0.9992, "grad_norm": 0.3030232762644485, "kl": 0.1395263671875, "learning_rate": 3.899100439408443e-11, "loss": 0.0512, "reward": 1.962890625, "reward_std": 0.29776880890130997, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.953125, "rewards/tag_count_reward": 0.962890625, "step": 2498 }, { "clip_ratio": 0.0, "completion_length": 706.734375, "epoch": 0.9996, "grad_norm": 0.38010100208386516, "kl": 0.12646484375, "learning_rate": 9.74775584916543e-12, "loss": 0.0173, "reward": 2.1171875, "reward_std": 0.10774768888950348, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.984375, "rewards/tag_count_reward": 0.9921875, "step": 2499 }, { "clip_ratio": 0.0, "completion_length": 706.015625, "epoch": 1.0, "grad_norm": 0.3991506854301767, "kl": 0.1396484375, "learning_rate": 0.0, "loss": 0.0601, "reward": 1.99609375, "reward_std": 0.39103028923273087, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.9375, "rewards/tag_count_reward": 0.95703125, "step": 2500 }, { "epoch": 1.0, "step": 2500, "total_flos": 0.0, "train_loss": 162.96194878991443, "train_runtime": 154205.9094, "train_samples_per_second": 0.13, "train_steps_per_second": 0.016 } ], "logging_steps": 1, "max_steps": 2500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }