{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0002, "grad_norm": 3.0746591091156006, "kl": 0.0001370312529616058, "learning_rate": 0.0, "loss": 0.0, "num_tokens": 8600.0, "reward": 0.700439453125, "reward_std": 0.014704298228025436, "rewards//mean": 0.700439453125, "rewards//std": 0.04464063048362732, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0004, "grad_norm": 2.9869141578674316, "kl": 6.267779826885089e-05, "learning_rate": 2e-08, "loss": 0.0, "num_tokens": 17200.0, "reward": 0.73077392578125, "reward_std": 0.015316192060709, "rewards//mean": 0.73077392578125, "rewards//std": 0.05491425469517708, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0006, "grad_norm": 3.3335037231445312, "kl": 0.0005714244398404844, "learning_rate": 4e-08, "loss": 0.0001, "num_tokens": 25872.0, "reward": 0.742431640625, "reward_std": 0.012949886731803417, "rewards//mean": 0.742431640625, "rewards//std": 0.04560147598385811, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0008, "grad_norm": 3.0980048179626465, "kl": 0.0005316020688042045, "learning_rate": 6e-08, "loss": 0.0001, "num_tokens": 34600.0, "reward": 0.715576171875, "reward_std": 0.015215152874588966, "rewards//mean": 0.715576171875, "rewards//std": 0.050897374749183655, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.001, "grad_norm": 3.049089193344116, "kl": 0.000517527550982777, "learning_rate": 8e-08, "loss": 0.0001, "num_tokens": 43304.0, "reward": 0.71856689453125, "reward_std": 0.01438464131206274, "rewards//mean": 0.71856689453125, "rewards//std": 0.05342591553926468, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0012, "grad_norm": 2.9623756408691406, "kl": 0.0005474825884448364, "learning_rate": 1e-07, "loss": 0.0001, "num_tokens": 51992.0, "reward": 0.713134765625, "reward_std": 0.012754758819937706, "rewards//mean": 0.713134765625, "rewards//std": 0.05878513306379318, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0014, "grad_norm": 3.1190226078033447, "kl": 0.0005763711305917241, "learning_rate": 1.2e-07, "loss": 0.0001, "num_tokens": 60696.0, "reward": 0.7222900390625, "reward_std": 0.014144840650260448, "rewards//mean": 0.7222900390625, "rewards//std": 0.03965155407786369, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0016, "grad_norm": 2.9739928245544434, "kl": 0.0005803547683171928, "learning_rate": 1.4e-07, "loss": 0.0001, "num_tokens": 69336.0, "reward": 0.732421875, "reward_std": 0.01358483824878931, "rewards//mean": 0.732421875, "rewards//std": 0.05295494943857193, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0018, "grad_norm": 3.1628496646881104, "kl": 0.0005657363144564442, "learning_rate": 1.6e-07, "loss": 0.0001, "num_tokens": 78008.0, "reward": 0.70953369140625, "reward_std": 0.013684568926692009, "rewards//mean": 0.70953369140625, "rewards//std": 0.05245111510157585, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.002, "grad_norm": 3.1755638122558594, "kl": 0.0005453915146063082, "learning_rate": 1.8e-07, "loss": 0.0001, "num_tokens": 86648.0, "reward": 0.68017578125, "reward_std": 0.014631778001785278, "rewards//mean": 0.68017578125, "rewards//std": 0.04801594093441963, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0022, "grad_norm": 3.0773203372955322, "kl": 0.0005841563324793242, "learning_rate": 2e-07, "loss": 0.0001, "num_tokens": 95456.0, "reward": 0.69476318359375, "reward_std": 0.012789730913937092, "rewards//mean": 0.69476318359375, "rewards//std": 0.043625302612781525, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0024, "grad_norm": 3.3067240715026855, "kl": 0.0005825260886922479, "learning_rate": 2.1999999999999998e-07, "loss": 0.0001, "num_tokens": 104112.0, "reward": 0.707763671875, "reward_std": 0.014192605391144753, "rewards//mean": 0.707763671875, "rewards//std": 0.04431390017271042, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0026, "grad_norm": 3.052741527557373, "kl": 0.0005862039251951501, "learning_rate": 2.4e-07, "loss": 0.0001, "num_tokens": 112728.0, "reward": 0.7296142578125, "reward_std": 0.01562296599149704, "rewards//mean": 0.7296142578125, "rewards//std": 0.04462214559316635, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0028, "grad_norm": 3.496222496032715, "kl": 0.000546930474229157, "learning_rate": 2.6e-07, "loss": 0.0001, "num_tokens": 121352.0, "reward": 0.75439453125, "reward_std": 0.020952299237251282, "rewards//mean": 0.75439453125, "rewards//std": 0.04351207613945007, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.003, "grad_norm": 3.0683610439300537, "kl": 0.0005508811809704639, "learning_rate": 2.8e-07, "loss": 0.0001, "num_tokens": 130072.0, "reward": 0.72918701171875, "reward_std": 0.02001885510981083, "rewards//mean": 0.72918701171875, "rewards//std": 0.059948600828647614, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0032, "grad_norm": 3.168755292892456, "kl": 0.0005511691051651724, "learning_rate": 3e-07, "loss": 0.0001, "num_tokens": 138680.0, "reward": 0.731689453125, "reward_std": 0.015139667317271233, "rewards//mean": 0.731689453125, "rewards//std": 0.04729103669524193, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0034, "grad_norm": 3.126718044281006, "kl": 0.000573088698729407, "learning_rate": 3.2e-07, "loss": 0.0001, "num_tokens": 147320.0, "reward": 0.68780517578125, "reward_std": 0.01579727604985237, "rewards//mean": 0.68780517578125, "rewards//std": 0.058778662234544754, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0036, "grad_norm": 3.0903584957122803, "kl": 0.0005722851856262423, "learning_rate": 3.4000000000000003e-07, "loss": 0.0001, "num_tokens": 155984.0, "reward": 0.71990966796875, "reward_std": 0.012143155559897423, "rewards//mean": 0.71990966796875, "rewards//std": 0.047596342861652374, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0038, "grad_norm": 3.2795073986053467, "kl": 0.000557706574909389, "learning_rate": 3.6e-07, "loss": 0.0001, "num_tokens": 164608.0, "reward": 0.702392578125, "reward_std": 0.021895065903663635, "rewards//mean": 0.702392578125, "rewards//std": 0.06125873699784279, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.004, "grad_norm": 3.1292951107025146, "kl": 0.0005708587632398121, "learning_rate": 3.7999999999999996e-07, "loss": 0.0001, "num_tokens": 173168.0, "reward": 0.72113037109375, "reward_std": 0.0165967158973217, "rewards//mean": 0.72113037109375, "rewards//std": 0.05302632972598076, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0042, "grad_norm": 3.0912601947784424, "kl": 0.0005462863045977429, "learning_rate": 4e-07, "loss": 0.0001, "num_tokens": 181792.0, "reward": 0.72027587890625, "reward_std": 0.015079968608915806, "rewards//mean": 0.72027587890625, "rewards//std": 0.0568997748196125, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0044, "grad_norm": 3.1044301986694336, "kl": 0.0005402523092925549, "learning_rate": 4.1999999999999995e-07, "loss": 0.0001, "num_tokens": 190392.0, "reward": 0.73260498046875, "reward_std": 0.01755766198039055, "rewards//mean": 0.73260498046875, "rewards//std": 0.03580089658498764, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0046, "grad_norm": 3.1160430908203125, "kl": 0.0005472921475302428, "learning_rate": 4.3999999999999997e-07, "loss": 0.0001, "num_tokens": 199080.0, "reward": 0.69964599609375, "reward_std": 0.015488953329622746, "rewards//mean": 0.69964599609375, "rewards//std": 0.04417289048433304, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0048, "grad_norm": 3.093228340148926, "kl": 0.000571258133277297, "learning_rate": 4.6e-07, "loss": 0.0001, "num_tokens": 207720.0, "reward": 0.72613525390625, "reward_std": 0.015438448637723923, "rewards//mean": 0.72613525390625, "rewards//std": 0.05386695638298988, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.005, "grad_norm": 3.132035493850708, "kl": 0.0005702183116227388, "learning_rate": 4.8e-07, "loss": 0.0001, "num_tokens": 216296.0, "reward": 0.708251953125, "reward_std": 0.01361482311040163, "rewards//mean": 0.708251953125, "rewards//std": 0.0540633462369442, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0052, "grad_norm": 3.058742046356201, "kl": 0.0005953195868642069, "learning_rate": 5e-07, "loss": 0.0001, "num_tokens": 224968.0, "reward": 0.75274658203125, "reward_std": 0.021526148542761803, "rewards//mean": 0.75274658203125, "rewards//std": 0.048328571021556854, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0054, "grad_norm": 3.0073442459106445, "kl": 0.0005892160552321002, "learning_rate": 5.2e-07, "loss": 0.0001, "num_tokens": 233528.0, "reward": 0.67626953125, "reward_std": 0.01784338429570198, "rewards//mean": 0.67626953125, "rewards//std": 0.06989618390798569, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0056, "grad_norm": 3.2221901416778564, "kl": 0.0005890742468181998, "learning_rate": 5.4e-07, "loss": 0.0001, "num_tokens": 242280.0, "reward": 0.68499755859375, "reward_std": 0.014033805578947067, "rewards//mean": 0.68499755859375, "rewards//std": 0.04582660645246506, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0058, "grad_norm": 3.031466484069824, "kl": 0.000607152069278527, "learning_rate": 5.6e-07, "loss": 0.0001, "num_tokens": 250976.0, "reward": 0.712646484375, "reward_std": 0.016498101875185966, "rewards//mean": 0.712646484375, "rewards//std": 0.04970809444785118, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.006, "grad_norm": 3.025085687637329, "kl": 0.0005704485083697364, "learning_rate": 5.8e-07, "loss": 0.0001, "num_tokens": 259632.0, "reward": 0.70501708984375, "reward_std": 0.015339357778429985, "rewards//mean": 0.70501708984375, "rewards//std": 0.045702897012233734, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0062, "grad_norm": 3.0791728496551514, "kl": 0.0006308649681159295, "learning_rate": 6e-07, "loss": 0.0001, "num_tokens": 268208.0, "reward": 0.739013671875, "reward_std": 0.017989136278629303, "rewards//mean": 0.739013671875, "rewards//std": 0.0432741641998291, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0064, "grad_norm": 3.1217832565307617, "kl": 0.0006620676867896691, "learning_rate": 6.2e-07, "loss": 0.0001, "num_tokens": 276824.0, "reward": 0.70941162109375, "reward_std": 0.016736187040805817, "rewards//mean": 0.70941162109375, "rewards//std": 0.05383322015404701, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0066, "grad_norm": 3.2112765312194824, "kl": 0.0005905148573219776, "learning_rate": 6.4e-07, "loss": 0.0001, "num_tokens": 285520.0, "reward": 0.73883056640625, "reward_std": 0.017011437565088272, "rewards//mean": 0.73883056640625, "rewards//std": 0.04840962961316109, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0068, "grad_norm": 3.0612096786499023, "kl": 0.0005876668219571002, "learning_rate": 6.6e-07, "loss": 0.0001, "num_tokens": 294144.0, "reward": 0.70819091796875, "reward_std": 0.017199836671352386, "rewards//mean": 0.70819091796875, "rewards//std": 0.04584411159157753, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.007, "grad_norm": 3.0824496746063232, "kl": 0.0006361504347296432, "learning_rate": 6.800000000000001e-07, "loss": 0.0001, "num_tokens": 302888.0, "reward": 0.735107421875, "reward_std": 0.011731683276593685, "rewards//mean": 0.735107421875, "rewards//std": 0.037343598902225494, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0072, "grad_norm": 3.2275145053863525, "kl": 0.0007240941995405592, "learning_rate": 7e-07, "loss": 0.0001, "num_tokens": 311504.0, "reward": 0.71002197265625, "reward_std": 0.01906929537653923, "rewards//mean": 0.71002197265625, "rewards//std": 0.06062629818916321, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0074, "grad_norm": 3.2091662883758545, "kl": 0.000766696102800779, "learning_rate": 7.2e-07, "loss": 0.0001, "num_tokens": 320208.0, "reward": 0.72698974609375, "reward_std": 0.015764687210321426, "rewards//mean": 0.72698974609375, "rewards//std": 0.04941116273403168, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0076, "grad_norm": 3.056762933731079, "kl": 0.0006720083692925982, "learning_rate": 7.4e-07, "loss": 0.0001, "num_tokens": 328904.0, "reward": 0.7076416015625, "reward_std": 0.013192662969231606, "rewards//mean": 0.7076416015625, "rewards//std": 0.03601868078112602, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0078, "grad_norm": 3.024362564086914, "kl": 0.0007397676017717458, "learning_rate": 7.599999999999999e-07, "loss": 0.0001, "num_tokens": 337632.0, "reward": 0.7073974609375, "reward_std": 0.016517726704478264, "rewards//mean": 0.7073974609375, "rewards//std": 0.06210627406835556, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.008, "grad_norm": 3.030052423477173, "kl": 0.000783703027991578, "learning_rate": 7.799999999999999e-07, "loss": 0.0001, "num_tokens": 346264.0, "reward": 0.6876220703125, "reward_std": 0.01800483465194702, "rewards//mean": 0.6876220703125, "rewards//std": 0.04799812287092209, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0082, "grad_norm": 3.3742542266845703, "kl": 0.0007968554928083904, "learning_rate": 8e-07, "loss": 0.0001, "num_tokens": 354912.0, "reward": 0.72332763671875, "reward_std": 0.015946194529533386, "rewards//mean": 0.72332763671875, "rewards//std": 0.05200214684009552, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0084, "grad_norm": 2.983952522277832, "kl": 0.0008011522004380822, "learning_rate": 8.199999999999999e-07, "loss": 0.0001, "num_tokens": 363544.0, "reward": 0.7408447265625, "reward_std": 0.013474998995661736, "rewards//mean": 0.7408447265625, "rewards//std": 0.04879509657621384, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0086, "grad_norm": 3.248126268386841, "kl": 0.0008595763283665292, "learning_rate": 8.399999999999999e-07, "loss": 0.0001, "num_tokens": 372144.0, "reward": 0.73944091796875, "reward_std": 0.016470063477754593, "rewards//mean": 0.73944091796875, "rewards//std": 0.047130998224020004, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0088, "grad_norm": 3.0845632553100586, "kl": 0.0008815952387521975, "learning_rate": 8.599999999999999e-07, "loss": 0.0001, "num_tokens": 380848.0, "reward": 0.73309326171875, "reward_std": 0.01539240125566721, "rewards//mean": 0.73309326171875, "rewards//std": 0.04680288955569267, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.009, "grad_norm": 3.2412116527557373, "kl": 0.000932343871681951, "learning_rate": 8.799999999999999e-07, "loss": 0.0001, "num_tokens": 389504.0, "reward": 0.6689453125, "reward_std": 0.012580599635839462, "rewards//mean": 0.6689453125, "rewards//std": 0.04404045641422272, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0092, "grad_norm": 3.1200408935546875, "kl": 0.0009622859070077538, "learning_rate": 9e-07, "loss": 0.0001, "num_tokens": 398200.0, "reward": 0.72552490234375, "reward_std": 0.015103422105312347, "rewards//mean": 0.72552490234375, "rewards//std": 0.06293346732854843, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0094, "grad_norm": 3.1873373985290527, "kl": 0.001048956903105136, "learning_rate": 9.2e-07, "loss": 0.0001, "num_tokens": 406848.0, "reward": 0.701416015625, "reward_std": 0.0156770758330822, "rewards//mean": 0.701416015625, "rewards//std": 0.0535547137260437, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0096, "grad_norm": 3.0196340084075928, "kl": 0.0011119443515781313, "learning_rate": 9.399999999999999e-07, "loss": 0.0001, "num_tokens": 415520.0, "reward": 0.70697021484375, "reward_std": 0.016891758888959885, "rewards//mean": 0.70697021484375, "rewards//std": 0.057861633598804474, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0098, "grad_norm": 2.955000400543213, "kl": 0.0010805678321048617, "learning_rate": 9.6e-07, "loss": 0.0001, "num_tokens": 424088.0, "reward": 0.71795654296875, "reward_std": 0.016512058675289154, "rewards//mean": 0.71795654296875, "rewards//std": 0.05882345885038376, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.01, "grad_norm": 3.1035845279693604, "kl": 0.001265802318812348, "learning_rate": 9.8e-07, "loss": 0.0001, "num_tokens": 432696.0, "reward": 0.7044677734375, "reward_std": 0.014481520280241966, "rewards//mean": 0.7044677734375, "rewards//std": 0.049777936190366745, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0102, "grad_norm": 3.059105396270752, "kl": 0.0014140766143100336, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 441352.0, "reward": 0.7259521484375, "reward_std": 0.014453758485615253, "rewards//mean": 0.7259521484375, "rewards//std": 0.03619309142231941, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0104, "grad_norm": 3.0305216312408447, "kl": 0.001641553535591811, "learning_rate": 9.999998993000298e-07, "loss": 0.0002, "num_tokens": 449960.0, "reward": 0.74005126953125, "reward_std": 0.015035301446914673, "rewards//mean": 0.74005126953125, "rewards//std": 0.049174390733242035, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0106, "grad_norm": 3.1068880558013916, "kl": 0.0014859264192637056, "learning_rate": 9.999995972001601e-07, "loss": 0.0001, "num_tokens": 458512.0, "reward": 0.7259521484375, "reward_std": 0.019076917320489883, "rewards//mean": 0.7259521484375, "rewards//std": 0.045931410044431686, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0108, "grad_norm": 3.2462215423583984, "kl": 0.0017005849367706105, "learning_rate": 9.999990937005123e-07, "loss": 0.0002, "num_tokens": 467048.0, "reward": 0.72802734375, "reward_std": 0.01297299936413765, "rewards//mean": 0.72802734375, "rewards//std": 0.044377390295267105, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.011, "grad_norm": 2.8820033073425293, "kl": 0.0015795445651747286, "learning_rate": 9.999983888012896e-07, "loss": 0.0002, "num_tokens": 475728.0, "reward": 0.70489501953125, "reward_std": 0.01869776099920273, "rewards//mean": 0.70489501953125, "rewards//std": 0.06755409389734268, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0112, "grad_norm": 3.0614702701568604, "kl": 0.0019457548114587553, "learning_rate": 9.999974825027754e-07, "loss": 0.0002, "num_tokens": 484360.0, "reward": 0.7200927734375, "reward_std": 0.014355950988829136, "rewards//mean": 0.7200927734375, "rewards//std": 0.05347480624914169, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0114, "grad_norm": 3.1908984184265137, "kl": 0.0019651364054880105, "learning_rate": 9.999963748053354e-07, "loss": 0.0002, "num_tokens": 493000.0, "reward": 0.738525390625, "reward_std": 0.01591881364583969, "rewards//mean": 0.738525390625, "rewards//std": 0.04919874295592308, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0116, "grad_norm": 2.9364521503448486, "kl": 0.0021245284951874055, "learning_rate": 9.99995065709415e-07, "loss": 0.0002, "num_tokens": 501632.0, "reward": 0.719970703125, "reward_std": 0.01409243606030941, "rewards//mean": 0.719970703125, "rewards//std": 0.04894702881574631, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0118, "grad_norm": 3.003506660461426, "kl": 0.0020506469227257185, "learning_rate": 9.999935552155421e-07, "loss": 0.0002, "num_tokens": 510288.0, "reward": 0.72265625, "reward_std": 0.013906879350543022, "rewards//mean": 0.72265625, "rewards//std": 0.04887336865067482, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.012, "grad_norm": 3.354442596435547, "kl": 0.002489405007509049, "learning_rate": 9.99991843324325e-07, "loss": 0.0002, "num_tokens": 518952.0, "reward": 0.72845458984375, "reward_std": 0.0177521500736475, "rewards//mean": 0.72845458984375, "rewards//std": 0.04704001545906067, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0122, "grad_norm": 3.0271553993225098, "kl": 0.0023055829078657553, "learning_rate": 9.999899300364532e-07, "loss": 0.0002, "num_tokens": 527520.0, "reward": 0.71478271484375, "reward_std": 0.013522474095225334, "rewards//mean": 0.71478271484375, "rewards//std": 0.04290665686130524, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0124, "grad_norm": 3.1818554401397705, "kl": 0.0025641661195550114, "learning_rate": 9.999878153526972e-07, "loss": 0.0003, "num_tokens": 536112.0, "reward": 0.70977783203125, "reward_std": 0.01426254864782095, "rewards//mean": 0.70977783203125, "rewards//std": 0.051858142018318176, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0126, "grad_norm": 3.3307740688323975, "kl": 0.0030574638076359406, "learning_rate": 9.999854992739093e-07, "loss": 0.0003, "num_tokens": 544736.0, "reward": 0.70355224609375, "reward_std": 0.013520177453756332, "rewards//mean": 0.70355224609375, "rewards//std": 0.04383714869618416, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0128, "grad_norm": 3.1743972301483154, "kl": 0.0038234230887610465, "learning_rate": 9.999829818010219e-07, "loss": 0.0004, "num_tokens": 553408.0, "reward": 0.72265625, "reward_std": 0.015198908746242523, "rewards//mean": 0.72265625, "rewards//std": 0.04554367810487747, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.013, "grad_norm": 3.0791256427764893, "kl": 0.0031573468004353344, "learning_rate": 9.999802629350491e-07, "loss": 0.0003, "num_tokens": 562064.0, "reward": 0.72296142578125, "reward_std": 0.016398118808865547, "rewards//mean": 0.72296142578125, "rewards//std": 0.04709308221936226, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0132, "grad_norm": 3.233900785446167, "kl": 0.003446219547186047, "learning_rate": 9.999773426770863e-07, "loss": 0.0003, "num_tokens": 570664.0, "reward": 0.71954345703125, "reward_std": 0.015487316995859146, "rewards//mean": 0.71954345703125, "rewards//std": 0.04876014590263367, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0134, "grad_norm": 3.3780455589294434, "kl": 0.004207952646538615, "learning_rate": 9.999742210283097e-07, "loss": 0.0004, "num_tokens": 579360.0, "reward": 0.71832275390625, "reward_std": 0.013939326629042625, "rewards//mean": 0.71832275390625, "rewards//std": 0.04669441655278206, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0136, "grad_norm": 3.114373207092285, "kl": 0.004350957824499346, "learning_rate": 9.999708979899767e-07, "loss": 0.0004, "num_tokens": 587992.0, "reward": 0.7491455078125, "reward_std": 0.013725947588682175, "rewards//mean": 0.7491455078125, "rewards//std": 0.03409987688064575, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0138, "grad_norm": 3.0971555709838867, "kl": 0.004925672605168074, "learning_rate": 9.999673735634259e-07, "loss": 0.0005, "num_tokens": 596608.0, "reward": 0.7000732421875, "reward_std": 0.015324430540204048, "rewards//mean": 0.7000732421875, "rewards//std": 0.03915829584002495, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.014, "grad_norm": 2.964268445968628, "kl": 0.004543857765384018, "learning_rate": 9.999636477500764e-07, "loss": 0.0005, "num_tokens": 605248.0, "reward": 0.708251953125, "reward_std": 0.017837759107351303, "rewards//mean": 0.708251953125, "rewards//std": 0.05202692002058029, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0142, "grad_norm": 3.014002561569214, "kl": 0.005491413321578875, "learning_rate": 9.999597205514296e-07, "loss": 0.0005, "num_tokens": 613824.0, "reward": 0.6988525390625, "reward_std": 0.017433026805520058, "rewards//mean": 0.6988525390625, "rewards//std": 0.05124284327030182, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0144, "grad_norm": 2.9774134159088135, "kl": 0.0056799468729877844, "learning_rate": 9.999555919690672e-07, "loss": 0.0006, "num_tokens": 622328.0, "reward": 0.73919677734375, "reward_std": 0.014505268074572086, "rewards//mean": 0.73919677734375, "rewards//std": 0.05622804909944534, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0146, "grad_norm": 3.0427486896514893, "kl": 0.005467346069053747, "learning_rate": 9.99951262004652e-07, "loss": 0.0005, "num_tokens": 630976.0, "reward": 0.6893310546875, "reward_std": 0.01674201712012291, "rewards//mean": 0.6893310546875, "rewards//std": 0.037168681621551514, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0148, "grad_norm": 3.013369083404541, "kl": 0.005091004626592621, "learning_rate": 9.999467306599285e-07, "loss": 0.0005, "num_tokens": 639624.0, "reward": 0.73468017578125, "reward_std": 0.013643546961247921, "rewards//mean": 0.73468017578125, "rewards//std": 0.03605286404490471, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.015, "grad_norm": 3.043957471847534, "kl": 0.008441416663117707, "learning_rate": 9.999419979367214e-07, "loss": 0.0008, "num_tokens": 648320.0, "reward": 0.7056884765625, "reward_std": 0.015556867234408855, "rewards//mean": 0.7056884765625, "rewards//std": 0.05865868926048279, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0152, "grad_norm": 3.231454849243164, "kl": 0.008039619016926736, "learning_rate": 9.999370638369376e-07, "loss": 0.0008, "num_tokens": 657080.0, "reward": 0.7269287109375, "reward_std": 0.013333200477063656, "rewards//mean": 0.7269287109375, "rewards//std": 0.04582317918539047, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0154, "grad_norm": 3.110569715499878, "kl": 0.007806680485373363, "learning_rate": 9.99931928362564e-07, "loss": 0.0008, "num_tokens": 665720.0, "reward": 0.69854736328125, "reward_std": 0.016080046072602272, "rewards//mean": 0.69854736328125, "rewards//std": 0.060083795338869095, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0156, "grad_norm": 2.921905994415283, "kl": 0.007719275105046108, "learning_rate": 9.999265915156696e-07, "loss": 0.0008, "num_tokens": 674336.0, "reward": 0.720458984375, "reward_std": 0.015469206497073174, "rewards//mean": 0.720458984375, "rewards//std": 0.047003354877233505, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0158, "grad_norm": 3.413708448410034, "kl": 0.009065819176612422, "learning_rate": 9.999210532984038e-07, "loss": 0.0009, "num_tokens": 682968.0, "reward": 0.68780517578125, "reward_std": 0.017976250499486923, "rewards//mean": 0.68780517578125, "rewards//std": 0.06774518638849258, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.016, "grad_norm": 3.190436363220215, "kl": 0.008012848178623244, "learning_rate": 9.999153137129977e-07, "loss": 0.0008, "num_tokens": 691640.0, "reward": 0.74041748046875, "reward_std": 0.017986297607421875, "rewards//mean": 0.74041748046875, "rewards//std": 0.057460226118564606, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0162, "grad_norm": 3.164731502532959, "kl": 0.007367404759861529, "learning_rate": 9.999093727617628e-07, "loss": 0.0007, "num_tokens": 700264.0, "reward": 0.7017822265625, "reward_std": 0.016125842928886414, "rewards//mean": 0.7017822265625, "rewards//std": 0.035538043826818466, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0164, "grad_norm": 2.9602649211883545, "kl": 0.008762065350310877, "learning_rate": 9.999032304470924e-07, "loss": 0.0009, "num_tokens": 708984.0, "reward": 0.737548828125, "reward_std": 0.013302361592650414, "rewards//mean": 0.737548828125, "rewards//std": 0.04255741462111473, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0166, "grad_norm": 3.0777437686920166, "kl": 0.009679947281256318, "learning_rate": 9.998968867714608e-07, "loss": 0.001, "num_tokens": 717568.0, "reward": 0.73052978515625, "reward_std": 0.012443384155631065, "rewards//mean": 0.73052978515625, "rewards//std": 0.04523146152496338, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0168, "grad_norm": 3.295330762863159, "kl": 0.01055754155095201, "learning_rate": 9.998903417374226e-07, "loss": 0.0011, "num_tokens": 726304.0, "reward": 0.7119140625, "reward_std": 0.013127093203365803, "rewards//mean": 0.7119140625, "rewards//std": 0.04087439924478531, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.017, "grad_norm": 3.223742961883545, "kl": 0.010981887433445081, "learning_rate": 9.998835953476147e-07, "loss": 0.0011, "num_tokens": 735000.0, "reward": 0.73431396484375, "reward_std": 0.012129535898566246, "rewards//mean": 0.73431396484375, "rewards//std": 0.04945709556341171, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0172, "grad_norm": 3.227137327194214, "kl": 0.010623314272379503, "learning_rate": 9.998766476047545e-07, "loss": 0.0011, "num_tokens": 743648.0, "reward": 0.7049560546875, "reward_std": 0.017435938119888306, "rewards//mean": 0.7049560546875, "rewards//std": 0.06491030752658844, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0174, "grad_norm": 3.1377432346343994, "kl": 0.011226685048313811, "learning_rate": 9.998694985116404e-07, "loss": 0.0011, "num_tokens": 752416.0, "reward": 0.71868896484375, "reward_std": 0.011546581983566284, "rewards//mean": 0.71868896484375, "rewards//std": 0.06741254776716232, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0176, "grad_norm": 2.9976580142974854, "kl": 0.011882514314493164, "learning_rate": 9.99862148071152e-07, "loss": 0.0012, "num_tokens": 761040.0, "reward": 0.73583984375, "reward_std": 0.013298182748258114, "rewards//mean": 0.73583984375, "rewards//std": 0.05867636576294899, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0178, "grad_norm": 3.0490307807922363, "kl": 0.010607951902784407, "learning_rate": 9.998545962862501e-07, "loss": 0.0011, "num_tokens": 769656.0, "reward": 0.7401123046875, "reward_std": 0.018074776977300644, "rewards//mean": 0.7401123046875, "rewards//std": 0.05264287069439888, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.018, "grad_norm": 2.989919662475586, "kl": 0.012744891719194129, "learning_rate": 9.998468431599767e-07, "loss": 0.0013, "num_tokens": 778248.0, "reward": 0.7086181640625, "reward_std": 0.0159517303109169, "rewards//mean": 0.7086181640625, "rewards//std": 0.020442752167582512, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0182, "grad_norm": 3.2538132667541504, "kl": 0.013988890452310443, "learning_rate": 9.998388886954545e-07, "loss": 0.0014, "num_tokens": 786856.0, "reward": 0.69415283203125, "reward_std": 0.017283614724874496, "rewards//mean": 0.69415283203125, "rewards//std": 0.0727759599685669, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0184, "grad_norm": 3.338348388671875, "kl": 0.013513089710613713, "learning_rate": 9.998307328958877e-07, "loss": 0.0014, "num_tokens": 795544.0, "reward": 0.696533203125, "reward_std": 0.017394915223121643, "rewards//mean": 0.696533203125, "rewards//std": 0.06082624942064285, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0186, "grad_norm": 3.17159104347229, "kl": 0.012391012947773561, "learning_rate": 9.998223757645617e-07, "loss": 0.0012, "num_tokens": 804104.0, "reward": 0.7415771484375, "reward_std": 0.01519560907036066, "rewards//mean": 0.7415771484375, "rewards//std": 0.05159846320748329, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0188, "grad_norm": 2.9525721073150635, "kl": 0.013435031520202756, "learning_rate": 9.998138173048423e-07, "loss": 0.0013, "num_tokens": 812768.0, "reward": 0.74493408203125, "reward_std": 0.018135907128453255, "rewards//mean": 0.74493408203125, "rewards//std": 0.0533226802945137, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.019, "grad_norm": 3.0639989376068115, "kl": 0.013018126715905964, "learning_rate": 9.99805057520177e-07, "loss": 0.0013, "num_tokens": 821400.0, "reward": 0.74066162109375, "reward_std": 0.013175277039408684, "rewards//mean": 0.74066162109375, "rewards//std": 0.04255594685673714, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0192, "grad_norm": 3.360504627227783, "kl": 0.011382284079445526, "learning_rate": 9.997960964140945e-07, "loss": 0.0011, "num_tokens": 829952.0, "reward": 0.7005615234375, "reward_std": 0.011306056752800941, "rewards//mean": 0.7005615234375, "rewards//std": 0.03229600936174393, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0194, "grad_norm": 3.1438801288604736, "kl": 0.01552145613823086, "learning_rate": 9.99786933990204e-07, "loss": 0.0016, "num_tokens": 838520.0, "reward": 0.71697998046875, "reward_std": 0.011182492598891258, "rewards//mean": 0.71697998046875, "rewards//std": 0.04406755790114403, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0196, "grad_norm": 3.20588755607605, "kl": 0.015421856835018843, "learning_rate": 9.997775702521965e-07, "loss": 0.0015, "num_tokens": 847128.0, "reward": 0.708740234375, "reward_std": 0.014776019379496574, "rewards//mean": 0.708740234375, "rewards//std": 0.06943509727716446, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0198, "grad_norm": 2.945568561553955, "kl": 0.015174815838690847, "learning_rate": 9.997680052038434e-07, "loss": 0.0015, "num_tokens": 855824.0, "reward": 0.69830322265625, "reward_std": 0.014003828167915344, "rewards//mean": 0.69830322265625, "rewards//std": 0.04895626753568649, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.02, "grad_norm": 3.091007709503174, "kl": 0.01613260098383762, "learning_rate": 9.997582388489973e-07, "loss": 0.0016, "num_tokens": 864520.0, "reward": 0.70916748046875, "reward_std": 0.012781353667378426, "rewards//mean": 0.70916748046875, "rewards//std": 0.07657796889543533, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0202, "grad_norm": 2.9633381366729736, "kl": 0.01702519622631371, "learning_rate": 9.997482711915925e-07, "loss": 0.0017, "num_tokens": 873152.0, "reward": 0.7056884765625, "reward_std": 0.015273596160113811, "rewards//mean": 0.7056884765625, "rewards//std": 0.05478581786155701, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0204, "grad_norm": 3.0281424522399902, "kl": 0.01615259307436645, "learning_rate": 9.99738102235644e-07, "loss": 0.0016, "num_tokens": 881824.0, "reward": 0.70928955078125, "reward_std": 0.013487773947417736, "rewards//mean": 0.70928955078125, "rewards//std": 0.0424087829887867, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0206, "grad_norm": 3.10520076751709, "kl": 0.01550467952620238, "learning_rate": 9.997277319852474e-07, "loss": 0.0016, "num_tokens": 890368.0, "reward": 0.7510986328125, "reward_std": 0.014099751599133015, "rewards//mean": 0.7510986328125, "rewards//std": 0.04322708770632744, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0208, "grad_norm": 3.294306516647339, "kl": 0.018466165813151747, "learning_rate": 9.997171604445802e-07, "loss": 0.0018, "num_tokens": 899128.0, "reward": 0.75274658203125, "reward_std": 0.02102423831820488, "rewards//mean": 0.75274658203125, "rewards//std": 0.04623865336179733, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.021, "grad_norm": 2.8307101726531982, "kl": 0.017229145538294688, "learning_rate": 9.997063876179007e-07, "loss": 0.0017, "num_tokens": 907808.0, "reward": 0.716064453125, "reward_std": 0.015134226530790329, "rewards//mean": 0.716064453125, "rewards//std": 0.04795221611857414, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0212, "grad_norm": 3.230468273162842, "kl": 0.01487345719942823, "learning_rate": 9.996954135095478e-07, "loss": 0.0015, "num_tokens": 916384.0, "reward": 0.7489013671875, "reward_std": 0.015160983428359032, "rewards//mean": 0.7489013671875, "rewards//std": 0.04460042715072632, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0214, "grad_norm": 3.031060218811035, "kl": 0.0202089183148928, "learning_rate": 9.996842381239422e-07, "loss": 0.002, "num_tokens": 925000.0, "reward": 0.73760986328125, "reward_std": 0.01461248192936182, "rewards//mean": 0.73760986328125, "rewards//std": 0.028718072921037674, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0216, "grad_norm": 2.955716609954834, "kl": 0.017871063493657857, "learning_rate": 9.996728614655853e-07, "loss": 0.0018, "num_tokens": 933680.0, "reward": 0.72540283203125, "reward_std": 0.012798861600458622, "rewards//mean": 0.72540283203125, "rewards//std": 0.034496817737817764, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0218, "grad_norm": 3.2322795391082764, "kl": 0.01609462348278612, "learning_rate": 9.996612835390594e-07, "loss": 0.0016, "num_tokens": 942360.0, "reward": 0.7109375, "reward_std": 0.01488967053592205, "rewards//mean": 0.7109375, "rewards//std": 0.05337861180305481, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.022, "grad_norm": 3.2382678985595703, "kl": 0.02326445246580988, "learning_rate": 9.996495043490283e-07, "loss": 0.0023, "num_tokens": 951000.0, "reward": 0.75140380859375, "reward_std": 0.012074257247149944, "rewards//mean": 0.75140380859375, "rewards//std": 0.04795974865555763, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0222, "grad_norm": 3.04787278175354, "kl": 0.01807693997398019, "learning_rate": 9.996375239002368e-07, "loss": 0.0018, "num_tokens": 959688.0, "reward": 0.6810302734375, "reward_std": 0.014035972766578197, "rewards//mean": 0.6810302734375, "rewards//std": 0.06006631255149841, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0224, "grad_norm": 2.985187292098999, "kl": 0.02059358573751524, "learning_rate": 9.996253421975102e-07, "loss": 0.0021, "num_tokens": 968352.0, "reward": 0.73004150390625, "reward_std": 0.012494131922721863, "rewards//mean": 0.73004150390625, "rewards//std": 0.04424411430954933, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0226, "grad_norm": 2.9988229274749756, "kl": 0.021581392036750913, "learning_rate": 9.996129592457556e-07, "loss": 0.0022, "num_tokens": 976936.0, "reward": 0.7276611328125, "reward_std": 0.013638101518154144, "rewards//mean": 0.7276611328125, "rewards//std": 0.04925578832626343, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0228, "grad_norm": 3.142057418823242, "kl": 0.021858555090148002, "learning_rate": 9.996003750499607e-07, "loss": 0.0022, "num_tokens": 985552.0, "reward": 0.7265625, "reward_std": 0.016436271369457245, "rewards//mean": 0.7265625, "rewards//std": 0.04292645305395126, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.023, "grad_norm": 2.978050470352173, "kl": 0.02196191088296473, "learning_rate": 9.995875896151944e-07, "loss": 0.0022, "num_tokens": 994264.0, "reward": 0.72979736328125, "reward_std": 0.012108192779123783, "rewards//mean": 0.72979736328125, "rewards//std": 0.036838702857494354, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0232, "grad_norm": 2.9693009853363037, "kl": 0.02642357745207846, "learning_rate": 9.99574602946607e-07, "loss": 0.0026, "num_tokens": 1002832.0, "reward": 0.72235107421875, "reward_std": 0.012415243312716484, "rewards//mean": 0.72235107421875, "rewards//std": 0.03544308617711067, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0234, "grad_norm": 3.0097129344940186, "kl": 0.023999035358428955, "learning_rate": 9.99561415049429e-07, "loss": 0.0024, "num_tokens": 1011448.0, "reward": 0.72015380859375, "reward_std": 0.015053506940603256, "rewards//mean": 0.72015380859375, "rewards//std": 0.03507256507873535, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0236, "grad_norm": 2.9691386222839355, "kl": 0.022306351806037128, "learning_rate": 9.99548025928973e-07, "loss": 0.0022, "num_tokens": 1020104.0, "reward": 0.7171630859375, "reward_std": 0.01403750479221344, "rewards//mean": 0.7171630859375, "rewards//std": 0.03787709400057793, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0238, "grad_norm": 3.0105018615722656, "kl": 0.01927804498700425, "learning_rate": 9.995344355906318e-07, "loss": 0.0019, "num_tokens": 1028696.0, "reward": 0.72906494140625, "reward_std": 0.01614948734641075, "rewards//mean": 0.72906494140625, "rewards//std": 0.02800769917666912, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.024, "grad_norm": 3.310129404067993, "kl": 0.022020738862920552, "learning_rate": 9.995206440398796e-07, "loss": 0.0022, "num_tokens": 1037384.0, "reward": 0.70855712890625, "reward_std": 0.012240133248269558, "rewards//mean": 0.70855712890625, "rewards//std": 0.038188494741916656, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0242, "grad_norm": 3.0024490356445312, "kl": 0.020602340518962592, "learning_rate": 9.995066512822718e-07, "loss": 0.0021, "num_tokens": 1046176.0, "reward": 0.713623046875, "reward_std": 0.007958738133311272, "rewards//mean": 0.713623046875, "rewards//std": 0.04178783670067787, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0244, "grad_norm": 3.0775814056396484, "kl": 0.027243567805271596, "learning_rate": 9.994924573234446e-07, "loss": 0.0027, "num_tokens": 1054816.0, "reward": 0.71746826171875, "reward_std": 0.016701359301805496, "rewards//mean": 0.71746826171875, "rewards//std": 0.058855876326560974, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0246, "grad_norm": 3.011335611343384, "kl": 0.02239358614315279, "learning_rate": 9.994780621691154e-07, "loss": 0.0022, "num_tokens": 1063496.0, "reward": 0.7353515625, "reward_std": 0.013828590512275696, "rewards//mean": 0.7353515625, "rewards//std": 0.03586220741271973, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0248, "grad_norm": 3.2017664909362793, "kl": 0.02780767437070608, "learning_rate": 9.994634658250824e-07, "loss": 0.0028, "num_tokens": 1072104.0, "reward": 0.743408203125, "reward_std": 0.012461268343031406, "rewards//mean": 0.743408203125, "rewards//std": 0.04123353585600853, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.025, "grad_norm": 3.2028744220733643, "kl": 0.022625074605457485, "learning_rate": 9.994486682972252e-07, "loss": 0.0023, "num_tokens": 1080752.0, "reward": 0.7366943359375, "reward_std": 0.014579675160348415, "rewards//mean": 0.7366943359375, "rewards//std": 0.038674402981996536, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0252, "grad_norm": 2.7825567722320557, "kl": 0.025536290602758527, "learning_rate": 9.99433669591504e-07, "loss": 0.0026, "num_tokens": 1089368.0, "reward": 0.7174072265625, "reward_std": 0.013983946293592453, "rewards//mean": 0.7174072265625, "rewards//std": 0.05010770633816719, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0254, "grad_norm": 2.904855966567993, "kl": 0.026453224942088127, "learning_rate": 9.994184697139604e-07, "loss": 0.0026, "num_tokens": 1097992.0, "reward": 0.74237060546875, "reward_std": 0.013720160350203514, "rewards//mean": 0.74237060546875, "rewards//std": 0.026350749656558037, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0256, "grad_norm": 3.1194891929626465, "kl": 0.026587890926748514, "learning_rate": 9.99403068670717e-07, "loss": 0.0027, "num_tokens": 1106576.0, "reward": 0.7259521484375, "reward_std": 0.01477651484310627, "rewards//mean": 0.7259521484375, "rewards//std": 0.03868379816412926, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0258, "grad_norm": 2.9864237308502197, "kl": 0.03235742053948343, "learning_rate": 9.993874664679772e-07, "loss": 0.0032, "num_tokens": 1115160.0, "reward": 0.71636962890625, "reward_std": 0.013840307481586933, "rewards//mean": 0.71636962890625, "rewards//std": 0.043076373636722565, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.026, "grad_norm": 3.104255199432373, "kl": 0.02509330166503787, "learning_rate": 9.993716631120258e-07, "loss": 0.0025, "num_tokens": 1123808.0, "reward": 0.72076416015625, "reward_std": 0.01157199963927269, "rewards//mean": 0.72076416015625, "rewards//std": 0.04962088167667389, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0262, "grad_norm": 2.942652702331543, "kl": 0.031184019171632826, "learning_rate": 9.99355658609228e-07, "loss": 0.0031, "num_tokens": 1132528.0, "reward": 0.7171630859375, "reward_std": 0.015324447304010391, "rewards//mean": 0.7171630859375, "rewards//std": 0.04991641268134117, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0264, "grad_norm": 2.9444375038146973, "kl": 0.03304997179657221, "learning_rate": 9.993394529660306e-07, "loss": 0.0033, "num_tokens": 1141160.0, "reward": 0.736328125, "reward_std": 0.013412285596132278, "rewards//mean": 0.736328125, "rewards//std": 0.032526031136512756, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0266, "grad_norm": 3.2348594665527344, "kl": 0.03439075197093189, "learning_rate": 9.993230461889615e-07, "loss": 0.0034, "num_tokens": 1149744.0, "reward": 0.72637939453125, "reward_std": 0.015599433332681656, "rewards//mean": 0.72637939453125, "rewards//std": 0.03849054127931595, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0268, "grad_norm": 3.5405704975128174, "kl": 0.03284156124573201, "learning_rate": 9.993064382846289e-07, "loss": 0.0033, "num_tokens": 1158344.0, "reward": 0.72247314453125, "reward_std": 0.015042467974126339, "rewards//mean": 0.72247314453125, "rewards//std": 0.04495181515812874, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.027, "grad_norm": 3.0331664085388184, "kl": 0.030899111938197166, "learning_rate": 9.992896292597228e-07, "loss": 0.0031, "num_tokens": 1166920.0, "reward": 0.68408203125, "reward_std": 0.018408963456749916, "rewards//mean": 0.68408203125, "rewards//std": 0.03464268893003464, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0272, "grad_norm": 3.180854082107544, "kl": 0.03986567130777985, "learning_rate": 9.992726191210137e-07, "loss": 0.004, "num_tokens": 1175528.0, "reward": 0.73486328125, "reward_std": 0.01281731203198433, "rewards//mean": 0.73486328125, "rewards//std": 0.04209749773144722, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0274, "grad_norm": 2.917390823364258, "kl": 0.031310008256696165, "learning_rate": 9.992554078753533e-07, "loss": 0.0031, "num_tokens": 1184144.0, "reward": 0.7275390625, "reward_std": 0.016602514311671257, "rewards//mean": 0.7275390625, "rewards//std": 0.05057813972234726, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0276, "grad_norm": 2.7335574626922607, "kl": 0.030509869335219264, "learning_rate": 9.992379955296745e-07, "loss": 0.0031, "num_tokens": 1192832.0, "reward": 0.7412109375, "reward_std": 0.016559338197112083, "rewards//mean": 0.7412109375, "rewards//std": 0.043858595192432404, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0278, "grad_norm": 3.168146848678589, "kl": 0.04336894187144935, "learning_rate": 9.992203820909905e-07, "loss": 0.0043, "num_tokens": 1201472.0, "reward": 0.7071533203125, "reward_std": 0.012441834434866905, "rewards//mean": 0.7071533203125, "rewards//std": 0.056705132126808167, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.028, "grad_norm": 2.996670722961426, "kl": 0.03549187898170203, "learning_rate": 9.992025675663965e-07, "loss": 0.0035, "num_tokens": 1210176.0, "reward": 0.750732421875, "reward_std": 0.01398580614477396, "rewards//mean": 0.750732421875, "rewards//std": 0.03658350929617882, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0282, "grad_norm": 3.4842395782470703, "kl": 0.03604160330723971, "learning_rate": 9.991845519630676e-07, "loss": 0.0036, "num_tokens": 1218872.0, "reward": 0.73309326171875, "reward_std": 0.01453987043350935, "rewards//mean": 0.73309326171875, "rewards//std": 0.03210673853754997, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0284, "grad_norm": 3.1160051822662354, "kl": 0.03330606734380126, "learning_rate": 9.991663352882613e-07, "loss": 0.0033, "num_tokens": 1227624.0, "reward": 0.732666015625, "reward_std": 0.013473456725478172, "rewards//mean": 0.732666015625, "rewards//std": 0.043580908328294754, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0286, "grad_norm": 2.8810338973999023, "kl": 0.041373745538294315, "learning_rate": 9.991479175493148e-07, "loss": 0.0041, "num_tokens": 1236264.0, "reward": 0.7386474609375, "reward_std": 0.011745231226086617, "rewards//mean": 0.7386474609375, "rewards//std": 0.03809388726949692, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0288, "grad_norm": 3.0718138217926025, "kl": 0.035379409266170114, "learning_rate": 9.991292987536468e-07, "loss": 0.0035, "num_tokens": 1244984.0, "reward": 0.7083740234375, "reward_std": 0.01356479525566101, "rewards//mean": 0.7083740234375, "rewards//std": 0.07008899748325348, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.029, "grad_norm": 2.791430950164795, "kl": 0.034212324768304825, "learning_rate": 9.991104789087569e-07, "loss": 0.0034, "num_tokens": 1253544.0, "reward": 0.69757080078125, "reward_std": 0.013164354488253593, "rewards//mean": 0.69757080078125, "rewards//std": 0.04364160820841789, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0292, "grad_norm": 2.9670677185058594, "kl": 0.04070010129362345, "learning_rate": 9.990914580222255e-07, "loss": 0.0041, "num_tokens": 1262272.0, "reward": 0.75714111328125, "reward_std": 0.014646430499851704, "rewards//mean": 0.75714111328125, "rewards//std": 0.04350125044584274, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0294, "grad_norm": 3.1393849849700928, "kl": 0.03216705098748207, "learning_rate": 9.990722361017149e-07, "loss": 0.0032, "num_tokens": 1270984.0, "reward": 0.7379150390625, "reward_std": 0.015045834705233574, "rewards//mean": 0.7379150390625, "rewards//std": 0.0475710891187191, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0296, "grad_norm": 3.0052967071533203, "kl": 0.0336679095053114, "learning_rate": 9.990528131549671e-07, "loss": 0.0034, "num_tokens": 1279664.0, "reward": 0.72052001953125, "reward_std": 0.016495231539011, "rewards//mean": 0.72052001953125, "rewards//std": 0.04428686201572418, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0298, "grad_norm": 2.77880597114563, "kl": 0.04252167057711631, "learning_rate": 9.990331891898058e-07, "loss": 0.0043, "num_tokens": 1288360.0, "reward": 0.72564697265625, "reward_std": 0.013310113921761513, "rewards//mean": 0.72564697265625, "rewards//std": 0.03551433980464935, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.03, "grad_norm": 2.8032565116882324, "kl": 0.034908757312223315, "learning_rate": 9.990133642141357e-07, "loss": 0.0035, "num_tokens": 1297032.0, "reward": 0.74017333984375, "reward_std": 0.011333253234624863, "rewards//mean": 0.74017333984375, "rewards//std": 0.03461815416812897, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0302, "grad_norm": 2.9779255390167236, "kl": 0.03928355360403657, "learning_rate": 9.989933382359422e-07, "loss": 0.0039, "num_tokens": 1305632.0, "reward": 0.712158203125, "reward_std": 0.012044407427310944, "rewards//mean": 0.712158203125, "rewards//std": 0.040784675627946854, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0304, "grad_norm": 3.1361095905303955, "kl": 0.04866291838698089, "learning_rate": 9.989731112632916e-07, "loss": 0.0049, "num_tokens": 1314272.0, "reward": 0.72283935546875, "reward_std": 0.013317112810909748, "rewards//mean": 0.72283935546875, "rewards//std": 0.04381676763296127, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0306, "grad_norm": 2.8830525875091553, "kl": 0.04183753626421094, "learning_rate": 9.989526833043316e-07, "loss": 0.0042, "num_tokens": 1322960.0, "reward": 0.76031494140625, "reward_std": 0.01955876313149929, "rewards//mean": 0.76031494140625, "rewards//std": 0.05183799937367439, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0308, "grad_norm": 2.8472084999084473, "kl": 0.04606934660114348, "learning_rate": 9.989320543672903e-07, "loss": 0.0046, "num_tokens": 1331608.0, "reward": 0.73382568359375, "reward_std": 0.016142776235938072, "rewards//mean": 0.73382568359375, "rewards//std": 0.054491691291332245, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.031, "grad_norm": 2.762299060821533, "kl": 0.03941800841130316, "learning_rate": 9.989112244604771e-07, "loss": 0.0039, "num_tokens": 1340352.0, "reward": 0.73822021484375, "reward_std": 0.013351533561944962, "rewards//mean": 0.73822021484375, "rewards//std": 0.04714544862508774, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0312, "grad_norm": 3.0830838680267334, "kl": 0.03498400142416358, "learning_rate": 9.988901935922825e-07, "loss": 0.0035, "num_tokens": 1349024.0, "reward": 0.72576904296875, "reward_std": 0.013017626479268074, "rewards//mean": 0.72576904296875, "rewards//std": 0.03293436020612717, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0314, "grad_norm": 3.0565831661224365, "kl": 0.04686063149711117, "learning_rate": 9.988689617711776e-07, "loss": 0.0047, "num_tokens": 1357544.0, "reward": 0.73284912109375, "reward_std": 0.013014718890190125, "rewards//mean": 0.73284912109375, "rewards//std": 0.0456744059920311, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0316, "grad_norm": 2.797093629837036, "kl": 0.05101523862686008, "learning_rate": 9.988475290057143e-07, "loss": 0.0051, "num_tokens": 1366224.0, "reward": 0.734130859375, "reward_std": 0.011036617681384087, "rewards//mean": 0.734130859375, "rewards//std": 0.04602967947721481, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0318, "grad_norm": 3.194139242172241, "kl": 0.05134878121316433, "learning_rate": 9.988258953045262e-07, "loss": 0.0051, "num_tokens": 1374848.0, "reward": 0.73370361328125, "reward_std": 0.01682290807366371, "rewards//mean": 0.73370361328125, "rewards//std": 0.043828513473272324, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.032, "grad_norm": 3.09578537940979, "kl": 0.0347396379802376, "learning_rate": 9.988040606763272e-07, "loss": 0.0035, "num_tokens": 1383456.0, "reward": 0.69970703125, "reward_std": 0.014353256672620773, "rewards//mean": 0.69970703125, "rewards//std": 0.05614941567182541, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0322, "grad_norm": 3.0768558979034424, "kl": 0.0460287892492488, "learning_rate": 9.98782025129912e-07, "loss": 0.0046, "num_tokens": 1392112.0, "reward": 0.72900390625, "reward_std": 0.01641533523797989, "rewards//mean": 0.72900390625, "rewards//std": 0.0477275513112545, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0324, "grad_norm": 2.6946260929107666, "kl": 0.04967822623439133, "learning_rate": 9.987597886741568e-07, "loss": 0.005, "num_tokens": 1400784.0, "reward": 0.75042724609375, "reward_std": 0.013429421000182629, "rewards//mean": 0.75042724609375, "rewards//std": 0.04385371878743172, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0326, "grad_norm": 2.928901195526123, "kl": 0.04909839539323002, "learning_rate": 9.987373513180184e-07, "loss": 0.0049, "num_tokens": 1409344.0, "reward": 0.7476806640625, "reward_std": 0.013493198901414871, "rewards//mean": 0.7476806640625, "rewards//std": 0.03423633053898811, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0328, "grad_norm": 2.9144558906555176, "kl": 0.050017297733575106, "learning_rate": 9.987147130705347e-07, "loss": 0.005, "num_tokens": 1417920.0, "reward": 0.73626708984375, "reward_std": 0.01147711556404829, "rewards//mean": 0.73626708984375, "rewards//std": 0.0361848808825016, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.033, "grad_norm": 3.2065091133117676, "kl": 0.05320246773771942, "learning_rate": 9.98691873940824e-07, "loss": 0.0053, "num_tokens": 1426608.0, "reward": 0.72613525390625, "reward_std": 0.016028691083192825, "rewards//mean": 0.72613525390625, "rewards//std": 0.040774233639240265, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0332, "grad_norm": 2.9466049671173096, "kl": 0.05405107664410025, "learning_rate": 9.98668833938086e-07, "loss": 0.0054, "num_tokens": 1435216.0, "reward": 0.7215576171875, "reward_std": 0.015404738485813141, "rewards//mean": 0.7215576171875, "rewards//std": 0.03980853036046028, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0334, "grad_norm": 2.866468667984009, "kl": 0.044184350059367716, "learning_rate": 9.986455930716016e-07, "loss": 0.0044, "num_tokens": 1443832.0, "reward": 0.6962890625, "reward_std": 0.014004740864038467, "rewards//mean": 0.6962890625, "rewards//std": 0.0615786537528038, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0336, "grad_norm": 2.7617156505584717, "kl": 0.0638790549710393, "learning_rate": 9.986221513507318e-07, "loss": 0.0064, "num_tokens": 1452488.0, "reward": 0.7462158203125, "reward_std": 0.014732494950294495, "rewards//mean": 0.7462158203125, "rewards//std": 0.038838449865579605, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0338, "grad_norm": 2.615412950515747, "kl": 0.047671781037934124, "learning_rate": 9.985985087849191e-07, "loss": 0.0048, "num_tokens": 1461184.0, "reward": 0.740966796875, "reward_std": 0.010601690039038658, "rewards//mean": 0.740966796875, "rewards//std": 0.04212266206741333, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.034, "grad_norm": 2.4398818016052246, "kl": 0.04848790564574301, "learning_rate": 9.985746653836866e-07, "loss": 0.0048, "num_tokens": 1469920.0, "reward": 0.74945068359375, "reward_std": 0.0126343360170722, "rewards//mean": 0.74945068359375, "rewards//std": 0.05037597566843033, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0342, "grad_norm": 2.9143829345703125, "kl": 0.05843106552492827, "learning_rate": 9.985506211566386e-07, "loss": 0.0058, "num_tokens": 1478560.0, "reward": 0.722412109375, "reward_std": 0.015987034887075424, "rewards//mean": 0.722412109375, "rewards//std": 0.04930692911148071, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0344, "grad_norm": 2.8919947147369385, "kl": 0.04990722984075546, "learning_rate": 9.9852637611346e-07, "loss": 0.005, "num_tokens": 1487232.0, "reward": 0.6947021484375, "reward_std": 0.012111399322748184, "rewards//mean": 0.6947021484375, "rewards//std": 0.05466189235448837, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0346, "grad_norm": 2.790154218673706, "kl": 0.06499220291152596, "learning_rate": 9.98501930263917e-07, "loss": 0.0065, "num_tokens": 1495848.0, "reward": 0.71551513671875, "reward_std": 0.014181406237185001, "rewards//mean": 0.71551513671875, "rewards//std": 0.04905419051647186, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0348, "grad_norm": 2.9558165073394775, "kl": 0.0698660952039063, "learning_rate": 9.984772836178556e-07, "loss": 0.007, "num_tokens": 1504680.0, "reward": 0.73541259765625, "reward_std": 0.013052749447524548, "rewards//mean": 0.73541259765625, "rewards//std": 0.04531572014093399, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.035, "grad_norm": 2.8896403312683105, "kl": 0.06764502776786685, "learning_rate": 9.984524361852043e-07, "loss": 0.0068, "num_tokens": 1513360.0, "reward": 0.712890625, "reward_std": 0.009989009238779545, "rewards//mean": 0.712890625, "rewards//std": 0.05103578418493271, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0352, "grad_norm": 2.751380443572998, "kl": 0.06374173518270254, "learning_rate": 9.984273879759712e-07, "loss": 0.0064, "num_tokens": 1522112.0, "reward": 0.73516845703125, "reward_std": 0.01333437766879797, "rewards//mean": 0.73516845703125, "rewards//std": 0.03761053830385208, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0354, "grad_norm": 3.128718137741089, "kl": 0.07074581575579941, "learning_rate": 9.984021390002457e-07, "loss": 0.0071, "num_tokens": 1530848.0, "reward": 0.70831298828125, "reward_std": 0.011916648596525192, "rewards//mean": 0.70831298828125, "rewards//std": 0.043787047266960144, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0356, "grad_norm": 3.010329246520996, "kl": 0.07081070146523416, "learning_rate": 9.983766892681985e-07, "loss": 0.0071, "num_tokens": 1539528.0, "reward": 0.7213134765625, "reward_std": 0.014988021925091743, "rewards//mean": 0.7213134765625, "rewards//std": 0.031827643513679504, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0358, "grad_norm": 2.772942304611206, "kl": 0.06943181017413735, "learning_rate": 9.983510387900802e-07, "loss": 0.0069, "num_tokens": 1548192.0, "reward": 0.70550537109375, "reward_std": 0.011353434063494205, "rewards//mean": 0.70550537109375, "rewards//std": 0.05040841922163963, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.036, "grad_norm": 2.921886682510376, "kl": 0.07014578208327293, "learning_rate": 9.983251875762232e-07, "loss": 0.007, "num_tokens": 1556856.0, "reward": 0.7374267578125, "reward_std": 0.015500213950872421, "rewards//mean": 0.7374267578125, "rewards//std": 0.04746277630329132, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0362, "grad_norm": 2.5986506938934326, "kl": 0.06503755692392588, "learning_rate": 9.982991356370403e-07, "loss": 0.0065, "num_tokens": 1565488.0, "reward": 0.7337646484375, "reward_std": 0.012532995082437992, "rewards//mean": 0.7337646484375, "rewards//std": 0.04871561378240585, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0364, "grad_norm": 2.6686618328094482, "kl": 0.07410385878756642, "learning_rate": 9.98272882983025e-07, "loss": 0.0074, "num_tokens": 1574184.0, "reward": 0.73846435546875, "reward_std": 0.01319533959031105, "rewards//mean": 0.73846435546875, "rewards//std": 0.05068543553352356, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0366, "grad_norm": 3.029433250427246, "kl": 0.07510905456729233, "learning_rate": 9.982464296247522e-07, "loss": 0.0075, "num_tokens": 1582888.0, "reward": 0.735107421875, "reward_std": 0.013636925257742405, "rewards//mean": 0.735107421875, "rewards//std": 0.03843037039041519, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0368, "grad_norm": 2.4225409030914307, "kl": 0.07150726299732924, "learning_rate": 9.98219775572877e-07, "loss": 0.0072, "num_tokens": 1591408.0, "reward": 0.75115966796875, "reward_std": 0.014323122799396515, "rewards//mean": 0.75115966796875, "rewards//std": 0.036202866584062576, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.037, "grad_norm": 2.533130168914795, "kl": 0.07464027963578701, "learning_rate": 9.981929208381357e-07, "loss": 0.0075, "num_tokens": 1600088.0, "reward": 0.76458740234375, "reward_std": 0.014717087149620056, "rewards//mean": 0.76458740234375, "rewards//std": 0.04011625796556473, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0372, "grad_norm": 2.432730197906494, "kl": 0.0943055716343224, "learning_rate": 9.981658654313456e-07, "loss": 0.0094, "num_tokens": 1608712.0, "reward": 0.7225341796875, "reward_std": 0.0087115578353405, "rewards//mean": 0.7225341796875, "rewards//std": 0.04055297002196312, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0374, "grad_norm": 2.751760482788086, "kl": 0.08527720882557333, "learning_rate": 9.981386093634045e-07, "loss": 0.0085, "num_tokens": 1617400.0, "reward": 0.75360107421875, "reward_std": 0.013320360332727432, "rewards//mean": 0.75360107421875, "rewards//std": 0.028614573180675507, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0376, "grad_norm": 2.9692792892456055, "kl": 0.07570782792754471, "learning_rate": 9.98111152645291e-07, "loss": 0.0076, "num_tokens": 1625992.0, "reward": 0.76275634765625, "reward_std": 0.015970878303050995, "rewards//mean": 0.76275634765625, "rewards//std": 0.037092190235853195, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0378, "grad_norm": 3.0419697761535645, "kl": 0.06978122459258884, "learning_rate": 9.98083495288065e-07, "loss": 0.007, "num_tokens": 1634576.0, "reward": 0.7088623046875, "reward_std": 0.01774086058139801, "rewards//mean": 0.7088623046875, "rewards//std": 0.04886329919099808, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.038, "grad_norm": 2.7983527183532715, "kl": 0.06901918211951852, "learning_rate": 9.980556373028665e-07, "loss": 0.0069, "num_tokens": 1643200.0, "reward": 0.737548828125, "reward_std": 0.012527244165539742, "rewards//mean": 0.737548828125, "rewards//std": 0.038499634712934494, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0382, "grad_norm": 2.493257522583008, "kl": 0.08206672128289938, "learning_rate": 9.98027578700917e-07, "loss": 0.0082, "num_tokens": 1651848.0, "reward": 0.70538330078125, "reward_std": 0.01150240283459425, "rewards//mean": 0.70538330078125, "rewards//std": 0.03535756468772888, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0384, "grad_norm": 2.7214250564575195, "kl": 0.08754912205040455, "learning_rate": 9.979993194935182e-07, "loss": 0.0088, "num_tokens": 1660472.0, "reward": 0.73846435546875, "reward_std": 0.013667328283190727, "rewards//mean": 0.73846435546875, "rewards//std": 0.04352281987667084, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0386, "grad_norm": 2.9392783641815186, "kl": 0.07446632068604231, "learning_rate": 9.979708596920529e-07, "loss": 0.0074, "num_tokens": 1669128.0, "reward": 0.750244140625, "reward_std": 0.015270931646227837, "rewards//mean": 0.750244140625, "rewards//std": 0.030583124607801437, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0388, "grad_norm": 3.087444543838501, "kl": 0.08147471048869193, "learning_rate": 9.97942199307985e-07, "loss": 0.0081, "num_tokens": 1677784.0, "reward": 0.7628173828125, "reward_std": 0.014745705761015415, "rewards//mean": 0.7628173828125, "rewards//std": 0.04542369768023491, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.039, "grad_norm": 2.8398241996765137, "kl": 0.0784428627230227, "learning_rate": 9.97913338352859e-07, "loss": 0.0078, "num_tokens": 1686448.0, "reward": 0.735107421875, "reward_std": 0.012391097843647003, "rewards//mean": 0.735107421875, "rewards//std": 0.030039772391319275, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0392, "grad_norm": 2.49025821685791, "kl": 0.08109743148088455, "learning_rate": 9.978842768382998e-07, "loss": 0.0081, "num_tokens": 1695072.0, "reward": 0.70452880859375, "reward_std": 0.013618113473057747, "rewards//mean": 0.70452880859375, "rewards//std": 0.03166856989264488, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0394, "grad_norm": 3.270029306411743, "kl": 0.08498156163841486, "learning_rate": 9.978550147760131e-07, "loss": 0.0085, "num_tokens": 1703680.0, "reward": 0.73675537109375, "reward_std": 0.017492208629846573, "rewards//mean": 0.73675537109375, "rewards//std": 0.0403052382171154, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0396, "grad_norm": 2.5561161041259766, "kl": 0.0859982690308243, "learning_rate": 9.978255521777862e-07, "loss": 0.0086, "num_tokens": 1712304.0, "reward": 0.73126220703125, "reward_std": 0.0102681340649724, "rewards//mean": 0.73126220703125, "rewards//std": 0.045068852603435516, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0398, "grad_norm": 3.171823263168335, "kl": 0.07568260550033301, "learning_rate": 9.977958890554866e-07, "loss": 0.0076, "num_tokens": 1720936.0, "reward": 0.71221923828125, "reward_std": 0.01587653160095215, "rewards//mean": 0.71221923828125, "rewards//std": 0.03565939515829086, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.04, "grad_norm": 2.9293112754821777, "kl": 0.07837174762971699, "learning_rate": 9.97766025421062e-07, "loss": 0.0078, "num_tokens": 1729552.0, "reward": 0.72186279296875, "reward_std": 0.014022290706634521, "rewards//mean": 0.72186279296875, "rewards//std": 0.03507126867771149, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0402, "grad_norm": 2.8085222244262695, "kl": 0.08167064702138305, "learning_rate": 9.977359612865422e-07, "loss": 0.0082, "num_tokens": 1738184.0, "reward": 0.71942138671875, "reward_std": 0.012507260777056217, "rewards//mean": 0.71942138671875, "rewards//std": 0.0408717580139637, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0404, "grad_norm": 3.0235538482666016, "kl": 0.07556112413294613, "learning_rate": 9.977056966640367e-07, "loss": 0.0076, "num_tokens": 1746792.0, "reward": 0.73321533203125, "reward_std": 0.01591806672513485, "rewards//mean": 0.73321533203125, "rewards//std": 0.0390729159116745, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0406, "grad_norm": 2.861449718475342, "kl": 0.07926510332617909, "learning_rate": 9.976752315657359e-07, "loss": 0.0079, "num_tokens": 1755408.0, "reward": 0.744140625, "reward_std": 0.01228757668286562, "rewards//mean": 0.744140625, "rewards//std": 0.03467413783073425, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0408, "grad_norm": 3.436288595199585, "kl": 0.09631677670404315, "learning_rate": 9.976445660039117e-07, "loss": 0.0096, "num_tokens": 1764008.0, "reward": 0.74468994140625, "reward_std": 0.012933210469782352, "rewards//mean": 0.74468994140625, "rewards//std": 0.03882846236228943, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.041, "grad_norm": 2.9841854572296143, "kl": 0.09829538897611201, "learning_rate": 9.976136999909155e-07, "loss": 0.0098, "num_tokens": 1772688.0, "reward": 0.7412109375, "reward_std": 0.010125808417797089, "rewards//mean": 0.7412109375, "rewards//std": 0.041701529175043106, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0412, "grad_norm": 3.1280481815338135, "kl": 0.09879924496635795, "learning_rate": 9.975826335391805e-07, "loss": 0.0099, "num_tokens": 1781256.0, "reward": 0.74725341796875, "reward_std": 0.01598658226430416, "rewards//mean": 0.74725341796875, "rewards//std": 0.041038088500499725, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0414, "grad_norm": 2.806591272354126, "kl": 0.08446787379216403, "learning_rate": 9.975513666612203e-07, "loss": 0.0084, "num_tokens": 1789976.0, "reward": 0.74365234375, "reward_std": 0.01498311199247837, "rewards//mean": 0.74365234375, "rewards//std": 0.05660908296704292, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0416, "grad_norm": 2.473963737487793, "kl": 0.09275762271136045, "learning_rate": 9.975198993696291e-07, "loss": 0.0093, "num_tokens": 1798664.0, "reward": 0.7291259765625, "reward_std": 0.013060636818408966, "rewards//mean": 0.7291259765625, "rewards//std": 0.04006778821349144, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0418, "grad_norm": 2.5815858840942383, "kl": 0.09844274073839188, "learning_rate": 9.97488231677082e-07, "loss": 0.0098, "num_tokens": 1807424.0, "reward": 0.68670654296875, "reward_std": 0.013432216830551624, "rewards//mean": 0.68670654296875, "rewards//std": 0.04699461907148361, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.042, "grad_norm": 2.440018892288208, "kl": 0.09588717669248581, "learning_rate": 9.974563635963347e-07, "loss": 0.0096, "num_tokens": 1816088.0, "reward": 0.72821044921875, "reward_std": 0.009933840483427048, "rewards//mean": 0.72821044921875, "rewards//std": 0.04099749028682709, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0422, "grad_norm": 2.6709070205688477, "kl": 0.09110309137031436, "learning_rate": 9.974242951402235e-07, "loss": 0.0091, "num_tokens": 1824672.0, "reward": 0.69232177734375, "reward_std": 0.011585031636059284, "rewards//mean": 0.69232177734375, "rewards//std": 0.046945951879024506, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0424, "grad_norm": 3.0153114795684814, "kl": 0.10191798605956137, "learning_rate": 9.973920263216657e-07, "loss": 0.0102, "num_tokens": 1833248.0, "reward": 0.7841796875, "reward_std": 0.014959658496081829, "rewards//mean": 0.7841796875, "rewards//std": 0.03194751217961311, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0426, "grad_norm": 2.5440561771392822, "kl": 0.0974107151851058, "learning_rate": 9.97359557153659e-07, "loss": 0.0097, "num_tokens": 1841808.0, "reward": 0.7314453125, "reward_std": 0.009747691452503204, "rewards//mean": 0.7314453125, "rewards//std": 0.03378254920244217, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0428, "grad_norm": 3.8489737510681152, "kl": 0.0931732514873147, "learning_rate": 9.973268876492825e-07, "loss": 0.0093, "num_tokens": 1850392.0, "reward": 0.7232666015625, "reward_std": 0.019548147916793823, "rewards//mean": 0.7232666015625, "rewards//std": 0.04620479792356491, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.043, "grad_norm": 3.214967966079712, "kl": 0.09402831085026264, "learning_rate": 9.972940178216952e-07, "loss": 0.0094, "num_tokens": 1859016.0, "reward": 0.7628173828125, "reward_std": 0.010977610945701599, "rewards//mean": 0.7628173828125, "rewards//std": 0.04047824442386627, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0432, "grad_norm": 3.1702773571014404, "kl": 0.09994646161794662, "learning_rate": 9.972609476841365e-07, "loss": 0.01, "num_tokens": 1867616.0, "reward": 0.739990234375, "reward_std": 0.008601821959018707, "rewards//mean": 0.739990234375, "rewards//std": 0.022733250632882118, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0434, "grad_norm": 2.3969881534576416, "kl": 0.09891431382857263, "learning_rate": 9.97227677249928e-07, "loss": 0.0099, "num_tokens": 1876296.0, "reward": 0.74725341796875, "reward_std": 0.013425001874566078, "rewards//mean": 0.74725341796875, "rewards//std": 0.038598936051130295, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0436, "grad_norm": 2.8317806720733643, "kl": 0.10404033353552222, "learning_rate": 9.971942065324702e-07, "loss": 0.0104, "num_tokens": 1884904.0, "reward": 0.74896240234375, "reward_std": 0.01664729230105877, "rewards//mean": 0.74896240234375, "rewards//std": 0.03809085860848427, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0438, "grad_norm": 2.7183804512023926, "kl": 0.09752598311752081, "learning_rate": 9.971605355452457e-07, "loss": 0.0098, "num_tokens": 1893616.0, "reward": 0.7247314453125, "reward_std": 0.0124040637165308, "rewards//mean": 0.7247314453125, "rewards//std": 0.04993703216314316, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.044, "grad_norm": 3.6828420162200928, "kl": 0.10862272512167692, "learning_rate": 9.97126664301817e-07, "loss": 0.0109, "num_tokens": 1902160.0, "reward": 0.6983642578125, "reward_std": 0.01596534624695778, "rewards//mean": 0.6983642578125, "rewards//std": 0.04885586351156235, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0442, "grad_norm": 2.340144634246826, "kl": 0.10954310419037938, "learning_rate": 9.970925928158272e-07, "loss": 0.011, "num_tokens": 1910880.0, "reward": 0.731689453125, "reward_std": 0.01111997477710247, "rewards//mean": 0.731689453125, "rewards//std": 0.035861365497112274, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0444, "grad_norm": 3.0173556804656982, "kl": 0.11066217673942447, "learning_rate": 9.970583211010007e-07, "loss": 0.0111, "num_tokens": 1919640.0, "reward": 0.70489501953125, "reward_std": 0.015776721760630608, "rewards//mean": 0.70489501953125, "rewards//std": 0.0446137897670269, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0446, "grad_norm": 2.8475310802459717, "kl": 0.10389193054288626, "learning_rate": 9.970238491711415e-07, "loss": 0.0104, "num_tokens": 1928296.0, "reward": 0.72418212890625, "reward_std": 0.011221460998058319, "rewards//mean": 0.72418212890625, "rewards//std": 0.032448623329401016, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0448, "grad_norm": 2.4673573970794678, "kl": 0.10303299408406019, "learning_rate": 9.969891770401356e-07, "loss": 0.0103, "num_tokens": 1937088.0, "reward": 0.751708984375, "reward_std": 0.010914693586528301, "rewards//mean": 0.751708984375, "rewards//std": 0.03294682502746582, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.045, "grad_norm": 3.270146608352661, "kl": 0.11207640403881669, "learning_rate": 9.969543047219486e-07, "loss": 0.0112, "num_tokens": 1945688.0, "reward": 0.75653076171875, "reward_std": 0.016141315922141075, "rewards//mean": 0.75653076171875, "rewards//std": 0.0417482852935791, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0452, "grad_norm": 2.757606029510498, "kl": 0.10813413886353374, "learning_rate": 9.96919232230627e-07, "loss": 0.0108, "num_tokens": 1954320.0, "reward": 0.74822998046875, "reward_std": 0.012435732409358025, "rewards//mean": 0.74822998046875, "rewards//std": 0.039290811866521835, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0454, "grad_norm": 2.6634392738342285, "kl": 0.11925068125128746, "learning_rate": 9.968839595802981e-07, "loss": 0.0119, "num_tokens": 1962944.0, "reward": 0.716552734375, "reward_std": 0.010293394327163696, "rewards//mean": 0.716552734375, "rewards//std": 0.027611492201685905, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0456, "grad_norm": 3.6396124362945557, "kl": 0.12086705304682255, "learning_rate": 9.968484867851697e-07, "loss": 0.0121, "num_tokens": 1971624.0, "reward": 0.758544921875, "reward_std": 0.015222180634737015, "rewards//mean": 0.758544921875, "rewards//std": 0.05078781023621559, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0458, "grad_norm": 2.7004621028900146, "kl": 0.1087592770345509, "learning_rate": 9.968128138595302e-07, "loss": 0.0109, "num_tokens": 1980280.0, "reward": 0.71453857421875, "reward_std": 0.013990378007292747, "rewards//mean": 0.71453857421875, "rewards//std": 0.03948374465107918, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.046, "grad_norm": 3.056368589401245, "kl": 0.11676244903355837, "learning_rate": 9.967769408177488e-07, "loss": 0.0117, "num_tokens": 1988880.0, "reward": 0.69268798828125, "reward_std": 0.013287386856973171, "rewards//mean": 0.69268798828125, "rewards//std": 0.05184588208794594, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0462, "grad_norm": 2.9305176734924316, "kl": 0.1197158400900662, "learning_rate": 9.967408676742751e-07, "loss": 0.012, "num_tokens": 1997536.0, "reward": 0.74053955078125, "reward_std": 0.013810476288199425, "rewards//mean": 0.74053955078125, "rewards//std": 0.04055796191096306, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0464, "grad_norm": 3.2707579135894775, "kl": 0.13016177900135517, "learning_rate": 9.967045944436393e-07, "loss": 0.013, "num_tokens": 2006280.0, "reward": 0.72113037109375, "reward_std": 0.011619940400123596, "rewards//mean": 0.72113037109375, "rewards//std": 0.03844134882092476, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0466, "grad_norm": 3.191673517227173, "kl": 0.14259618474170566, "learning_rate": 9.96668121140452e-07, "loss": 0.0143, "num_tokens": 2015040.0, "reward": 0.756103515625, "reward_std": 0.016418365761637688, "rewards//mean": 0.756103515625, "rewards//std": 0.039689138531684875, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0468, "grad_norm": 3.194607973098755, "kl": 0.13389609195291996, "learning_rate": 9.966314477794052e-07, "loss": 0.0134, "num_tokens": 2023640.0, "reward": 0.7322998046875, "reward_std": 0.010810410603880882, "rewards//mean": 0.7322998046875, "rewards//std": 0.037917040288448334, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.047, "grad_norm": 2.7117807865142822, "kl": 0.1349084647372365, "learning_rate": 9.965945743752705e-07, "loss": 0.0135, "num_tokens": 2032216.0, "reward": 0.73382568359375, "reward_std": 0.011349475011229515, "rewards//mean": 0.73382568359375, "rewards//std": 0.048444636166095734, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0472, "grad_norm": 3.4043350219726562, "kl": 0.15029606316238642, "learning_rate": 9.965575009429005e-07, "loss": 0.015, "num_tokens": 2040856.0, "reward": 0.74774169921875, "reward_std": 0.015436086803674698, "rewards//mean": 0.74774169921875, "rewards//std": 0.04153744876384735, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0474, "grad_norm": 2.815829038619995, "kl": 0.1407718537375331, "learning_rate": 9.965202274972286e-07, "loss": 0.0141, "num_tokens": 2049408.0, "reward": 0.72216796875, "reward_std": 0.012544216588139534, "rewards//mean": 0.72216796875, "rewards//std": 0.0369827039539814, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0476, "grad_norm": 3.4487929344177246, "kl": 0.15310040256008506, "learning_rate": 9.964827540532684e-07, "loss": 0.0153, "num_tokens": 2058016.0, "reward": 0.72662353515625, "reward_std": 0.016283154487609863, "rewards//mean": 0.72662353515625, "rewards//std": 0.04449181258678436, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0478, "grad_norm": 3.5169429779052734, "kl": 0.1543840290978551, "learning_rate": 9.964450806261144e-07, "loss": 0.0154, "num_tokens": 2066648.0, "reward": 0.7509765625, "reward_std": 0.01590714603662491, "rewards//mean": 0.7509765625, "rewards//std": 0.034973207861185074, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.048, "grad_norm": 3.831986665725708, "kl": 0.1407278785482049, "learning_rate": 9.96407207230941e-07, "loss": 0.0141, "num_tokens": 2075336.0, "reward": 0.7237548828125, "reward_std": 0.012750649824738503, "rewards//mean": 0.7237548828125, "rewards//std": 0.040704984217882156, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0482, "grad_norm": 3.1790924072265625, "kl": 0.158443967346102, "learning_rate": 9.963691338830042e-07, "loss": 0.0158, "num_tokens": 2083952.0, "reward": 0.73724365234375, "reward_std": 0.015838809311389923, "rewards//mean": 0.73724365234375, "rewards//std": 0.024312397465109825, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0484, "grad_norm": 2.7173264026641846, "kl": 0.15428494522348046, "learning_rate": 9.963308605976396e-07, "loss": 0.0154, "num_tokens": 2092624.0, "reward": 0.76165771484375, "reward_std": 0.011489255353808403, "rewards//mean": 0.76165771484375, "rewards//std": 0.02435469999909401, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0486, "grad_norm": 2.8586461544036865, "kl": 0.1581531437113881, "learning_rate": 9.962923873902636e-07, "loss": 0.0158, "num_tokens": 2101160.0, "reward": 0.71075439453125, "reward_std": 0.012413685210049152, "rewards//mean": 0.71075439453125, "rewards//std": 0.04495181515812874, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0488, "grad_norm": 2.937629222869873, "kl": 0.16615721164271235, "learning_rate": 9.962537142763732e-07, "loss": 0.0166, "num_tokens": 2109792.0, "reward": 0.7449951171875, "reward_std": 0.012411234900355339, "rewards//mean": 0.7449951171875, "rewards//std": 0.024953732267022133, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.049, "grad_norm": 3.2936151027679443, "kl": 0.2063107956200838, "learning_rate": 9.962148412715463e-07, "loss": 0.0206, "num_tokens": 2118552.0, "reward": 0.75042724609375, "reward_std": 0.010296858847141266, "rewards//mean": 0.75042724609375, "rewards//std": 0.03866437450051308, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0492, "grad_norm": 3.623534917831421, "kl": 0.1343393293209374, "learning_rate": 9.961757683914405e-07, "loss": 0.0134, "num_tokens": 2127248.0, "reward": 0.69366455078125, "reward_std": 0.010699542239308357, "rewards//mean": 0.69366455078125, "rewards//std": 0.04388511925935745, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0494, "grad_norm": 4.162594795227051, "kl": 0.16983078233897686, "learning_rate": 9.961364956517946e-07, "loss": 0.017, "num_tokens": 2135896.0, "reward": 0.74365234375, "reward_std": 0.016386723145842552, "rewards//mean": 0.74365234375, "rewards//std": 0.04790989309549332, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0496, "grad_norm": 3.966630458831787, "kl": 0.18166909040883183, "learning_rate": 9.960970230684275e-07, "loss": 0.0182, "num_tokens": 2144536.0, "reward": 0.72308349609375, "reward_std": 0.014546409249305725, "rewards//mean": 0.72308349609375, "rewards//std": 0.05282726511359215, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0498, "grad_norm": 2.8995301723480225, "kl": 0.14150721998885274, "learning_rate": 9.960573506572389e-07, "loss": 0.0142, "num_tokens": 2153104.0, "reward": 0.74957275390625, "reward_std": 0.011014558374881744, "rewards//mean": 0.74957275390625, "rewards//std": 0.024101028218865395, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.05, "grad_norm": 3.355257511138916, "kl": 0.19822023855522275, "learning_rate": 9.960174784342087e-07, "loss": 0.0198, "num_tokens": 2161736.0, "reward": 0.7359619140625, "reward_std": 0.011475984007120132, "rewards//mean": 0.7359619140625, "rewards//std": 0.037581801414489746, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0502, "grad_norm": 3.375710964202881, "kl": 0.16241988725960255, "learning_rate": 9.959774064153975e-07, "loss": 0.0162, "num_tokens": 2170344.0, "reward": 0.71624755859375, "reward_std": 0.009172160178422928, "rewards//mean": 0.71624755859375, "rewards//std": 0.03990514203906059, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0504, "grad_norm": 3.359903573989868, "kl": 0.17989000072702765, "learning_rate": 9.959371346169465e-07, "loss": 0.018, "num_tokens": 2179056.0, "reward": 0.783935546875, "reward_std": 0.0145841920748353, "rewards//mean": 0.783935546875, "rewards//std": 0.030487943440675735, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0506, "grad_norm": 3.0368056297302246, "kl": 0.1985956854186952, "learning_rate": 9.95896663055077e-07, "loss": 0.0199, "num_tokens": 2187640.0, "reward": 0.709716796875, "reward_std": 0.011430484242737293, "rewards//mean": 0.709716796875, "rewards//std": 0.04344732314348221, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0508, "grad_norm": 2.8358800411224365, "kl": 0.1734612863510847, "learning_rate": 9.958559917460907e-07, "loss": 0.0173, "num_tokens": 2196336.0, "reward": 0.75439453125, "reward_std": 0.00935526005923748, "rewards//mean": 0.75439453125, "rewards//std": 0.03223055601119995, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.051, "grad_norm": 3.5464892387390137, "kl": 0.1810889858752489, "learning_rate": 9.958151207063703e-07, "loss": 0.0181, "num_tokens": 2205024.0, "reward": 0.74468994140625, "reward_std": 0.012457584962248802, "rewards//mean": 0.74468994140625, "rewards//std": 0.03684404492378235, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0512, "grad_norm": 5.931046485900879, "kl": 0.27709746547043324, "learning_rate": 9.957740499523785e-07, "loss": 0.0277, "num_tokens": 2213608.0, "reward": 0.74755859375, "reward_std": 0.013767718337476254, "rewards//mean": 0.74755859375, "rewards//std": 0.04670677334070206, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0514, "grad_norm": 3.762895107269287, "kl": 0.17110665002837777, "learning_rate": 9.957327795006588e-07, "loss": 0.0171, "num_tokens": 2222264.0, "reward": 0.77410888671875, "reward_std": 0.013452369719743729, "rewards//mean": 0.77410888671875, "rewards//std": 0.04332830756902695, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0516, "grad_norm": 4.586542129516602, "kl": 0.2573170200921595, "learning_rate": 9.956913093678348e-07, "loss": 0.0257, "num_tokens": 2230880.0, "reward": 0.71356201171875, "reward_std": 0.014903232455253601, "rewards//mean": 0.71356201171875, "rewards//std": 0.042870305478572845, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0518, "grad_norm": 3.6989188194274902, "kl": 0.19445497635751963, "learning_rate": 9.956496395706105e-07, "loss": 0.0194, "num_tokens": 2239608.0, "reward": 0.7647705078125, "reward_std": 0.013435694389045238, "rewards//mean": 0.7647705078125, "rewards//std": 0.03896608203649521, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.052, "grad_norm": 4.194847106933594, "kl": 0.22157821152359247, "learning_rate": 9.956077701257707e-07, "loss": 0.0222, "num_tokens": 2248296.0, "reward": 0.7449951171875, "reward_std": 0.013505147770047188, "rewards//mean": 0.7449951171875, "rewards//std": 0.026932932436466217, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0522, "grad_norm": 4.072581768035889, "kl": 0.2203886266797781, "learning_rate": 9.955657010501806e-07, "loss": 0.022, "num_tokens": 2256976.0, "reward": 0.7191162109375, "reward_std": 0.0126027287915349, "rewards//mean": 0.7191162109375, "rewards//std": 0.03821449726819992, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0524, "grad_norm": 4.167779922485352, "kl": 0.3016416598111391, "learning_rate": 9.955234323607851e-07, "loss": 0.0302, "num_tokens": 2265672.0, "reward": 0.75775146484375, "reward_std": 0.014491424895823002, "rewards//mean": 0.75775146484375, "rewards//std": 0.04258937016129494, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0526, "grad_norm": 3.6395280361175537, "kl": 0.2628649156540632, "learning_rate": 9.954809640746105e-07, "loss": 0.0263, "num_tokens": 2274336.0, "reward": 0.76348876953125, "reward_std": 0.012867321260273457, "rewards//mean": 0.76348876953125, "rewards//std": 0.039622291922569275, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0528, "grad_norm": 3.4668288230895996, "kl": 0.281156400218606, "learning_rate": 9.954382962087627e-07, "loss": 0.0281, "num_tokens": 2282984.0, "reward": 0.77264404296875, "reward_std": 0.013112173415720463, "rewards//mean": 0.77264404296875, "rewards//std": 0.02909199893474579, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.053, "grad_norm": 3.6522719860076904, "kl": 0.31555200181901455, "learning_rate": 9.953954287804284e-07, "loss": 0.0316, "num_tokens": 2291520.0, "reward": 0.71893310546875, "reward_std": 0.012379538267850876, "rewards//mean": 0.71893310546875, "rewards//std": 0.03527955710887909, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0532, "grad_norm": 4.363163948059082, "kl": 0.27224128041416407, "learning_rate": 9.953523618068748e-07, "loss": 0.0272, "num_tokens": 2300080.0, "reward": 0.72747802734375, "reward_std": 0.01283281296491623, "rewards//mean": 0.72747802734375, "rewards//std": 0.029771430417895317, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0534, "grad_norm": 4.9658308029174805, "kl": 0.42545278184115887, "learning_rate": 9.95309095305449e-07, "loss": 0.0425, "num_tokens": 2308656.0, "reward": 0.7386474609375, "reward_std": 0.01564006507396698, "rewards//mean": 0.7386474609375, "rewards//std": 0.03796651214361191, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0536, "grad_norm": 3.5091545581817627, "kl": 0.31685093883425, "learning_rate": 9.952656292935788e-07, "loss": 0.0317, "num_tokens": 2317368.0, "reward": 0.76239013671875, "reward_std": 0.01158678624778986, "rewards//mean": 0.76239013671875, "rewards//std": 0.03563561290502548, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0538, "grad_norm": 4.6175312995910645, "kl": 0.38975627813488245, "learning_rate": 9.952219637887725e-07, "loss": 0.039, "num_tokens": 2325992.0, "reward": 0.736083984375, "reward_std": 0.015487446449697018, "rewards//mean": 0.736083984375, "rewards//std": 0.03353699669241905, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.054, "grad_norm": 3.726688861846924, "kl": 0.3101207744330168, "learning_rate": 9.951780988086183e-07, "loss": 0.031, "num_tokens": 2334616.0, "reward": 0.76055908203125, "reward_std": 0.011042140424251556, "rewards//mean": 0.76055908203125, "rewards//std": 0.029295260086655617, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0542, "grad_norm": 4.996911525726318, "kl": 0.3400790123268962, "learning_rate": 9.95134034370785e-07, "loss": 0.034, "num_tokens": 2343264.0, "reward": 0.74603271484375, "reward_std": 0.013830387964844704, "rewards//mean": 0.74603271484375, "rewards//std": 0.03156227618455887, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0544, "grad_norm": 3.765669584274292, "kl": 0.4163520308211446, "learning_rate": 9.95089770493022e-07, "loss": 0.0416, "num_tokens": 2351936.0, "reward": 0.73028564453125, "reward_std": 0.01131827849894762, "rewards//mean": 0.73028564453125, "rewards//std": 0.047021668404340744, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0546, "grad_norm": 4.735191345214844, "kl": 0.3753213444724679, "learning_rate": 9.950453071931588e-07, "loss": 0.0375, "num_tokens": 2360560.0, "reward": 0.74945068359375, "reward_std": 0.013796941377222538, "rewards//mean": 0.74945068359375, "rewards//std": 0.0388689860701561, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0548, "grad_norm": 3.7093846797943115, "kl": 0.3756133262068033, "learning_rate": 9.950006444891048e-07, "loss": 0.0376, "num_tokens": 2369160.0, "reward": 0.70928955078125, "reward_std": 0.013247143477201462, "rewards//mean": 0.70928955078125, "rewards//std": 0.037219706922769547, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.055, "grad_norm": 4.347089767456055, "kl": 0.49243751261383295, "learning_rate": 9.949557823988506e-07, "loss": 0.0492, "num_tokens": 2377840.0, "reward": 0.73736572265625, "reward_std": 0.014801887795329094, "rewards//mean": 0.73736572265625, "rewards//std": 0.035927943885326385, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0552, "grad_norm": 3.7578723430633545, "kl": 0.6516364244744182, "learning_rate": 9.949107209404663e-07, "loss": 0.0652, "num_tokens": 2386504.0, "reward": 0.74981689453125, "reward_std": 0.012601152062416077, "rewards//mean": 0.74981689453125, "rewards//std": 0.024660447612404823, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0554, "grad_norm": 5.244663238525391, "kl": 0.5927953533828259, "learning_rate": 9.94865460132103e-07, "loss": 0.0593, "num_tokens": 2395128.0, "reward": 0.74749755859375, "reward_std": 0.02090274542570114, "rewards//mean": 0.74749755859375, "rewards//std": 0.03469066694378853, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0556, "grad_norm": 3.8761532306671143, "kl": 0.4448296641930938, "learning_rate": 9.948199999919912e-07, "loss": 0.0445, "num_tokens": 2403824.0, "reward": 0.71746826171875, "reward_std": 0.010741055011749268, "rewards//mean": 0.71746826171875, "rewards//std": 0.03245935216546059, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0558, "grad_norm": 4.406347751617432, "kl": 0.6288035763427615, "learning_rate": 9.947743405384428e-07, "loss": 0.0629, "num_tokens": 2412440.0, "reward": 0.7379150390625, "reward_std": 0.015601018443703651, "rewards//mean": 0.7379150390625, "rewards//std": 0.04014930874109268, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.056, "grad_norm": 4.828309059143066, "kl": 0.5129829635843635, "learning_rate": 9.947284817898492e-07, "loss": 0.0513, "num_tokens": 2421072.0, "reward": 0.70001220703125, "reward_std": 0.014600463211536407, "rewards//mean": 0.70001220703125, "rewards//std": 0.03711136430501938, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0562, "grad_norm": 5.15447473526001, "kl": 0.46403655782341957, "learning_rate": 9.946824237646824e-07, "loss": 0.0464, "num_tokens": 2429736.0, "reward": 0.737060546875, "reward_std": 0.013210458680987358, "rewards//mean": 0.737060546875, "rewards//std": 0.03836729750037193, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0564, "grad_norm": 4.505428791046143, "kl": 0.6311583276838064, "learning_rate": 9.946361664814943e-07, "loss": 0.0631, "num_tokens": 2438336.0, "reward": 0.73468017578125, "reward_std": 0.014126582071185112, "rewards//mean": 0.73468017578125, "rewards//std": 0.022727172821760178, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0566, "grad_norm": 3.6432912349700928, "kl": 0.38745086546987295, "learning_rate": 9.945897099589173e-07, "loss": 0.0387, "num_tokens": 2446944.0, "reward": 0.722412109375, "reward_std": 0.012055720202624798, "rewards//mean": 0.722412109375, "rewards//std": 0.030456148087978363, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0568, "grad_norm": 3.8316993713378906, "kl": 0.6913583185523748, "learning_rate": 9.945430542156646e-07, "loss": 0.0691, "num_tokens": 2455528.0, "reward": 0.75775146484375, "reward_std": 0.013003588654100895, "rewards//mean": 0.75775146484375, "rewards//std": 0.02943393401801586, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.057, "grad_norm": 4.167176246643066, "kl": 0.5367339439690113, "learning_rate": 9.944961992705286e-07, "loss": 0.0537, "num_tokens": 2464104.0, "reward": 0.7257080078125, "reward_std": 0.010289316065609455, "rewards//mean": 0.7257080078125, "rewards//std": 0.04251524433493614, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0572, "grad_norm": 4.4267401695251465, "kl": 0.7689229855313897, "learning_rate": 9.944491451423827e-07, "loss": 0.0769, "num_tokens": 2472768.0, "reward": 0.77447509765625, "reward_std": 0.012099775485694408, "rewards//mean": 0.77447509765625, "rewards//std": 0.025441773235797882, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0574, "grad_norm": 5.860213279724121, "kl": 0.6941496105864644, "learning_rate": 9.944018918501805e-07, "loss": 0.0694, "num_tokens": 2481432.0, "reward": 0.72503662109375, "reward_std": 0.01605089195072651, "rewards//mean": 0.72503662109375, "rewards//std": 0.03806779906153679, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0576, "grad_norm": 6.553688049316406, "kl": 0.5049006678164005, "learning_rate": 9.94354439412955e-07, "loss": 0.0505, "num_tokens": 2490120.0, "reward": 0.73065185546875, "reward_std": 0.013142431154847145, "rewards//mean": 0.73065185546875, "rewards//std": 0.025765178725123405, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0578, "grad_norm": 4.863574504852295, "kl": 0.3878084821626544, "learning_rate": 9.943067878498209e-07, "loss": 0.0388, "num_tokens": 2498832.0, "reward": 0.74432373046875, "reward_std": 0.009785640053451061, "rewards//mean": 0.74432373046875, "rewards//std": 0.03728553652763367, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.058, "grad_norm": 4.903024673461914, "kl": 0.39208444207906723, "learning_rate": 9.942589371799714e-07, "loss": 0.0392, "num_tokens": 2507544.0, "reward": 0.75457763671875, "reward_std": 0.011581134051084518, "rewards//mean": 0.75457763671875, "rewards//std": 0.036779481917619705, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0582, "grad_norm": 4.678225040435791, "kl": 0.7103300355374813, "learning_rate": 9.94210887422681e-07, "loss": 0.071, "num_tokens": 2516200.0, "reward": 0.75054931640625, "reward_std": 0.013709913939237595, "rewards//mean": 0.75054931640625, "rewards//std": 0.035935528576374054, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0584, "grad_norm": 4.853988170623779, "kl": 0.43904567416757345, "learning_rate": 9.941626385973047e-07, "loss": 0.0439, "num_tokens": 2524768.0, "reward": 0.75555419921875, "reward_std": 0.011930609121918678, "rewards//mean": 0.75555419921875, "rewards//std": 0.025956004858016968, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0586, "grad_norm": 5.283975601196289, "kl": 0.34506158623844385, "learning_rate": 9.941141907232763e-07, "loss": 0.0345, "num_tokens": 2533440.0, "reward": 0.753173828125, "reward_std": 0.009904064238071442, "rewards//mean": 0.753173828125, "rewards//std": 0.03344298154115677, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0588, "grad_norm": 5.530971527099609, "kl": 0.8286517476662993, "learning_rate": 9.94065543820111e-07, "loss": 0.0829, "num_tokens": 2542128.0, "reward": 0.77996826171875, "reward_std": 0.01934969238936901, "rewards//mean": 0.77996826171875, "rewards//std": 0.02818603254854679, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.059, "grad_norm": 5.333427429199219, "kl": 1.1374772489070892, "learning_rate": 9.94016697907404e-07, "loss": 0.1137, "num_tokens": 2550680.0, "reward": 0.73394775390625, "reward_std": 0.013834717683494091, "rewards//mean": 0.73394775390625, "rewards//std": 0.031025337055325508, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0592, "grad_norm": 4.085994243621826, "kl": 0.8266544556245208, "learning_rate": 9.9396765300483e-07, "loss": 0.0827, "num_tokens": 2559424.0, "reward": 0.759521484375, "reward_std": 0.012676459737122059, "rewards//mean": 0.759521484375, "rewards//std": 0.039302803575992584, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0594, "grad_norm": 4.988473892211914, "kl": 1.1103221122175455, "learning_rate": 9.939184091321444e-07, "loss": 0.111, "num_tokens": 2568040.0, "reward": 0.7689208984375, "reward_std": 0.015579704195261002, "rewards//mean": 0.7689208984375, "rewards//std": 0.038413625210523605, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0596, "grad_norm": 4.135498046875, "kl": 0.9984888043254614, "learning_rate": 9.938689663091827e-07, "loss": 0.0998, "num_tokens": 2576776.0, "reward": 0.7529296875, "reward_std": 0.01220305822789669, "rewards//mean": 0.7529296875, "rewards//std": 0.041240144520998, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0598, "grad_norm": 4.1136369705200195, "kl": 0.9260764308273792, "learning_rate": 9.938193245558604e-07, "loss": 0.0926, "num_tokens": 2585392.0, "reward": 0.71807861328125, "reward_std": 0.011635358445346355, "rewards//mean": 0.71807861328125, "rewards//std": 0.04143089801073074, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.06, "grad_norm": 5.337719917297363, "kl": 0.7270172508433461, "learning_rate": 9.937694838921733e-07, "loss": 0.0727, "num_tokens": 2594032.0, "reward": 0.7274169921875, "reward_std": 0.009733819402754307, "rewards//mean": 0.7274169921875, "rewards//std": 0.026634516194462776, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0602, "grad_norm": 5.916358470916748, "kl": 0.6537883328273892, "learning_rate": 9.93719444338197e-07, "loss": 0.0654, "num_tokens": 2602736.0, "reward": 0.72100830078125, "reward_std": 0.01079651154577732, "rewards//mean": 0.72100830078125, "rewards//std": 0.047860853374004364, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0604, "grad_norm": 4.627238750457764, "kl": 0.9710520636290312, "learning_rate": 9.936692059140878e-07, "loss": 0.0971, "num_tokens": 2611384.0, "reward": 0.763671875, "reward_std": 0.015687113627791405, "rewards//mean": 0.763671875, "rewards//std": 0.03544783964753151, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0606, "grad_norm": 4.747879505157471, "kl": 0.9860076494514942, "learning_rate": 9.936187686400814e-07, "loss": 0.0986, "num_tokens": 2620152.0, "reward": 0.7462158203125, "reward_std": 0.015207450836896896, "rewards//mean": 0.7462158203125, "rewards//std": 0.040624577552080154, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0608, "grad_norm": 4.995135307312012, "kl": 0.5666801882907748, "learning_rate": 9.93568132536494e-07, "loss": 0.0567, "num_tokens": 2628816.0, "reward": 0.7508544921875, "reward_std": 0.013964075595140457, "rewards//mean": 0.7508544921875, "rewards//std": 0.03339655324816704, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.061, "grad_norm": 4.070781707763672, "kl": 0.782807239331305, "learning_rate": 9.935172976237217e-07, "loss": 0.0783, "num_tokens": 2637496.0, "reward": 0.7484130859375, "reward_std": 0.013928147032856941, "rewards//mean": 0.7484130859375, "rewards//std": 0.0234345942735672, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0612, "grad_norm": 4.972671985626221, "kl": 1.1988671775907278, "learning_rate": 9.93466263922241e-07, "loss": 0.1199, "num_tokens": 2646104.0, "reward": 0.751953125, "reward_std": 0.014233101159334183, "rewards//mean": 0.751953125, "rewards//std": 0.03372514620423317, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0614, "grad_norm": 4.157861709594727, "kl": 0.9355712188407779, "learning_rate": 9.934150314526083e-07, "loss": 0.0936, "num_tokens": 2654744.0, "reward": 0.76129150390625, "reward_std": 0.012417576275765896, "rewards//mean": 0.76129150390625, "rewards//std": 0.017401453107595444, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0616, "grad_norm": 4.780778884887695, "kl": 0.7273468747735023, "learning_rate": 9.933636002354599e-07, "loss": 0.0727, "num_tokens": 2663376.0, "reward": 0.7191162109375, "reward_std": 0.010411866009235382, "rewards//mean": 0.7191162109375, "rewards//std": 0.023475898429751396, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0618, "grad_norm": 5.462563514709473, "kl": 1.1083982829004526, "learning_rate": 9.933119702915124e-07, "loss": 0.1108, "num_tokens": 2671952.0, "reward": 0.7236328125, "reward_std": 0.015762878581881523, "rewards//mean": 0.7236328125, "rewards//std": 0.03274126723408699, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.062, "grad_norm": 4.324944972991943, "kl": 0.9199625449255109, "learning_rate": 9.93260141641562e-07, "loss": 0.092, "num_tokens": 2680624.0, "reward": 0.73858642578125, "reward_std": 0.01111672818660736, "rewards//mean": 0.73858642578125, "rewards//std": 0.03486433997750282, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0622, "grad_norm": 5.655622959136963, "kl": 1.42093366663903, "learning_rate": 9.932081143064858e-07, "loss": 0.1421, "num_tokens": 2689176.0, "reward": 0.76385498046875, "reward_std": 0.018049361184239388, "rewards//mean": 0.76385498046875, "rewards//std": 0.042472273111343384, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0624, "grad_norm": 4.6508097648620605, "kl": 0.7688910737633705, "learning_rate": 9.931558883072402e-07, "loss": 0.0769, "num_tokens": 2697864.0, "reward": 0.75213623046875, "reward_std": 0.013499232940375805, "rewards//mean": 0.75213623046875, "rewards//std": 0.030488377436995506, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0626, "grad_norm": 4.622182846069336, "kl": 0.4912436017766595, "learning_rate": 9.931034636648616e-07, "loss": 0.0491, "num_tokens": 2706480.0, "reward": 0.71551513671875, "reward_std": 0.012210061773657799, "rewards//mean": 0.71551513671875, "rewards//std": 0.021514564752578735, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0628, "grad_norm": 4.870460510253906, "kl": 0.8243315378203988, "learning_rate": 9.930508404004666e-07, "loss": 0.0824, "num_tokens": 2715056.0, "reward": 0.7562255859375, "reward_std": 0.0113151203840971, "rewards//mean": 0.7562255859375, "rewards//std": 0.030510524287819862, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.063, "grad_norm": 4.56314754486084, "kl": 0.5322395460680127, "learning_rate": 9.929980185352525e-07, "loss": 0.0532, "num_tokens": 2723632.0, "reward": 0.7457275390625, "reward_std": 0.008279968984425068, "rewards//mean": 0.7457275390625, "rewards//std": 0.03119351714849472, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0632, "grad_norm": 4.371639728546143, "kl": 0.6331577720120549, "learning_rate": 9.929449980904951e-07, "loss": 0.0633, "num_tokens": 2732264.0, "reward": 0.72454833984375, "reward_std": 0.012990620918571949, "rewards//mean": 0.72454833984375, "rewards//std": 0.03534514456987381, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0634, "grad_norm": 3.9564285278320312, "kl": 1.0012456197291613, "learning_rate": 9.928917790875516e-07, "loss": 0.1001, "num_tokens": 2740960.0, "reward": 0.7568359375, "reward_std": 0.012390851974487305, "rewards//mean": 0.7568359375, "rewards//std": 0.0322268009185791, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0636, "grad_norm": 4.573144912719727, "kl": 0.969984645023942, "learning_rate": 9.928383615478586e-07, "loss": 0.097, "num_tokens": 2749528.0, "reward": 0.74847412109375, "reward_std": 0.010943949222564697, "rewards//mean": 0.74847412109375, "rewards//std": 0.028634140267968178, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0638, "grad_norm": 4.976900100708008, "kl": 0.735985585488379, "learning_rate": 9.927847454929322e-07, "loss": 0.0736, "num_tokens": 2758176.0, "reward": 0.753662109375, "reward_std": 0.01002872921526432, "rewards//mean": 0.753662109375, "rewards//std": 0.02685553953051567, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.064, "grad_norm": 4.124001979827881, "kl": 0.8998236870393157, "learning_rate": 9.927309309443695e-07, "loss": 0.09, "num_tokens": 2766760.0, "reward": 0.76275634765625, "reward_std": 0.013413220643997192, "rewards//mean": 0.76275634765625, "rewards//std": 0.035897597670555115, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0642, "grad_norm": 6.085822582244873, "kl": 1.1758588114753366, "learning_rate": 9.926769179238464e-07, "loss": 0.1176, "num_tokens": 2775344.0, "reward": 0.74359130859375, "reward_std": 0.01115325279533863, "rewards//mean": 0.74359130859375, "rewards//std": 0.038590699434280396, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0644, "grad_norm": 5.7256574630737305, "kl": 0.8206800632178783, "learning_rate": 9.926227064531199e-07, "loss": 0.0821, "num_tokens": 2783880.0, "reward": 0.74560546875, "reward_std": 0.014594745822250843, "rewards//mean": 0.74560546875, "rewards//std": 0.030405409634113312, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0646, "grad_norm": 4.412210941314697, "kl": 1.0538199730217457, "learning_rate": 9.925682965540263e-07, "loss": 0.1054, "num_tokens": 2792480.0, "reward": 0.74951171875, "reward_std": 0.016410717740654945, "rewards//mean": 0.74951171875, "rewards//std": 0.036288533359766006, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0648, "grad_norm": 6.5532002449035645, "kl": 0.456143987365067, "learning_rate": 9.925136882484815e-07, "loss": 0.0456, "num_tokens": 2801088.0, "reward": 0.7254638671875, "reward_std": 0.00946881715208292, "rewards//mean": 0.7254638671875, "rewards//std": 0.033526841551065445, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.065, "grad_norm": 7.161509990692139, "kl": 1.4719884041696787, "learning_rate": 9.92458881558482e-07, "loss": 0.1472, "num_tokens": 2809768.0, "reward": 0.74249267578125, "reward_std": 0.012742443010210991, "rewards//mean": 0.74249267578125, "rewards//std": 0.03293528035283089, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0652, "grad_norm": 4.279499530792236, "kl": 1.221362279728055, "learning_rate": 9.92403876506104e-07, "loss": 0.1221, "num_tokens": 2818328.0, "reward": 0.75152587890625, "reward_std": 0.015956521034240723, "rewards//mean": 0.75152587890625, "rewards//std": 0.03385858237743378, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0654, "grad_norm": 5.586702823638916, "kl": 1.3700648723170161, "learning_rate": 9.923486731135033e-07, "loss": 0.137, "num_tokens": 2826984.0, "reward": 0.7186279296875, "reward_std": 0.020838936790823936, "rewards//mean": 0.7186279296875, "rewards//std": 0.059400323778390884, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0656, "grad_norm": 4.409854888916016, "kl": 1.432421556673944, "learning_rate": 9.922932714029163e-07, "loss": 0.1432, "num_tokens": 2835544.0, "reward": 0.75897216796875, "reward_std": 0.014281929470598698, "rewards//mean": 0.75897216796875, "rewards//std": 0.028811747208237648, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0658, "grad_norm": 6.012087821960449, "kl": 0.9793825000524521, "learning_rate": 9.92237671396658e-07, "loss": 0.0979, "num_tokens": 2844184.0, "reward": 0.76617431640625, "reward_std": 0.01349995844066143, "rewards//mean": 0.76617431640625, "rewards//std": 0.028517598286271095, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.066, "grad_norm": 4.234142780303955, "kl": 0.9242843044921756, "learning_rate": 9.921818731171248e-07, "loss": 0.0924, "num_tokens": 2852824.0, "reward": 0.72479248046875, "reward_std": 0.015175123699009418, "rewards//mean": 0.72479248046875, "rewards//std": 0.026584099978208542, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0662, "grad_norm": 4.870720386505127, "kl": 1.1872072061523795, "learning_rate": 9.921258765867919e-07, "loss": 0.1187, "num_tokens": 2861568.0, "reward": 0.755126953125, "reward_std": 0.013591473922133446, "rewards//mean": 0.755126953125, "rewards//std": 0.0360499769449234, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0664, "grad_norm": 5.395249366760254, "kl": 0.9591629793867469, "learning_rate": 9.920696818282147e-07, "loss": 0.0959, "num_tokens": 2870168.0, "reward": 0.7657470703125, "reward_std": 0.014035122469067574, "rewards//mean": 0.7657470703125, "rewards//std": 0.029528219252824783, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0666, "grad_norm": 4.52071475982666, "kl": 1.1475577987730503, "learning_rate": 9.920132888640284e-07, "loss": 0.1148, "num_tokens": 2878752.0, "reward": 0.72406005859375, "reward_std": 0.013598522171378136, "rewards//mean": 0.72406005859375, "rewards//std": 0.03329412639141083, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0668, "grad_norm": 5.5705790519714355, "kl": 1.3797461157664657, "learning_rate": 9.919566977169485e-07, "loss": 0.138, "num_tokens": 2887504.0, "reward": 0.7239990234375, "reward_std": 0.016014471650123596, "rewards//mean": 0.7239990234375, "rewards//std": 0.03900180757045746, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.067, "grad_norm": 4.385342597961426, "kl": 1.0792835243046284, "learning_rate": 9.918999084097694e-07, "loss": 0.1079, "num_tokens": 2896176.0, "reward": 0.70703125, "reward_std": 0.01298388373106718, "rewards//mean": 0.70703125, "rewards//std": 0.02993578463792801, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0672, "grad_norm": 5.2752838134765625, "kl": 1.3719432950019836, "learning_rate": 9.91842920965366e-07, "loss": 0.1372, "num_tokens": 2904808.0, "reward": 0.7362060546875, "reward_std": 0.014960775151848793, "rewards//mean": 0.7362060546875, "rewards//std": 0.03218143805861473, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0674, "grad_norm": 4.5603251457214355, "kl": 0.9280480965971947, "learning_rate": 9.91785735406693e-07, "loss": 0.0928, "num_tokens": 2913544.0, "reward": 0.723876953125, "reward_std": 0.01555250771343708, "rewards//mean": 0.723876953125, "rewards//std": 0.037711478769779205, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0676, "grad_norm": 5.770708084106445, "kl": 0.7606811327859759, "learning_rate": 9.917283517567843e-07, "loss": 0.0761, "num_tokens": 2922208.0, "reward": 0.7403564453125, "reward_std": 0.013976898044347763, "rewards//mean": 0.7403564453125, "rewards//std": 0.028704695403575897, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0678, "grad_norm": 4.471171855926514, "kl": 1.3800368467345834, "learning_rate": 9.916707700387545e-07, "loss": 0.138, "num_tokens": 2930904.0, "reward": 0.73065185546875, "reward_std": 0.01717003807425499, "rewards//mean": 0.73065185546875, "rewards//std": 0.038735173642635345, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.068, "grad_norm": 4.665797233581543, "kl": 1.1380757903680205, "learning_rate": 9.916129902757974e-07, "loss": 0.1138, "num_tokens": 2939600.0, "reward": 0.75115966796875, "reward_std": 0.010834988206624985, "rewards//mean": 0.75115966796875, "rewards//std": 0.0265179630368948, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0682, "grad_norm": 7.105188846588135, "kl": 0.8124034851789474, "learning_rate": 9.915550124911866e-07, "loss": 0.0812, "num_tokens": 2948432.0, "reward": 0.7431640625, "reward_std": 0.012510347180068493, "rewards//mean": 0.7431640625, "rewards//std": 0.025976480916142464, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0684, "grad_norm": 6.090428352355957, "kl": 1.9907669760286808, "learning_rate": 9.914968367082755e-07, "loss": 0.1991, "num_tokens": 2957032.0, "reward": 0.76763916015625, "reward_std": 0.016988936811685562, "rewards//mean": 0.76763916015625, "rewards//std": 0.028450636193156242, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0686, "grad_norm": 4.496386528015137, "kl": 1.0567503031343222, "learning_rate": 9.914384629504973e-07, "loss": 0.1057, "num_tokens": 2965680.0, "reward": 0.7489013671875, "reward_std": 0.016614051535725594, "rewards//mean": 0.7489013671875, "rewards//std": 0.038578782230615616, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0688, "grad_norm": 5.6869330406188965, "kl": 1.4211051985621452, "learning_rate": 9.913798912413652e-07, "loss": 0.1421, "num_tokens": 2974304.0, "reward": 0.701171875, "reward_std": 0.014944510534405708, "rewards//mean": 0.701171875, "rewards//std": 0.044792965054512024, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.069, "grad_norm": 6.705982208251953, "kl": 1.7800831086933613, "learning_rate": 9.913211216044713e-07, "loss": 0.178, "num_tokens": 2982920.0, "reward": 0.7130126953125, "reward_std": 0.013807592913508415, "rewards//mean": 0.7130126953125, "rewards//std": 0.044018279761075974, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0692, "grad_norm": 5.911066055297852, "kl": 0.8889193050563335, "learning_rate": 9.912621540634886e-07, "loss": 0.0889, "num_tokens": 2991640.0, "reward": 0.7552490234375, "reward_std": 0.012438319623470306, "rewards//mean": 0.7552490234375, "rewards//std": 0.027963140979409218, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0694, "grad_norm": 4.615927219390869, "kl": 1.5478127505630255, "learning_rate": 9.91202988642169e-07, "loss": 0.1548, "num_tokens": 3000224.0, "reward": 0.74017333984375, "reward_std": 0.02050355263054371, "rewards//mean": 0.74017333984375, "rewards//std": 0.04133981838822365, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0696, "grad_norm": 4.6172404289245605, "kl": 1.6639066198840737, "learning_rate": 9.911436253643443e-07, "loss": 0.1664, "num_tokens": 3008880.0, "reward": 0.727783203125, "reward_std": 0.01631959155201912, "rewards//mean": 0.727783203125, "rewards//std": 0.034024547785520554, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0698, "grad_norm": 7.348176002502441, "kl": 1.608157278969884, "learning_rate": 9.91084064253926e-07, "loss": 0.1608, "num_tokens": 3017536.0, "reward": 0.72088623046875, "reward_std": 0.012965100817382336, "rewards//mean": 0.72088623046875, "rewards//std": 0.03418118134140968, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.07, "grad_norm": 11.13676929473877, "kl": 2.0739471651613712, "learning_rate": 9.910243053349055e-07, "loss": 0.2074, "num_tokens": 3026160.0, "reward": 0.75830078125, "reward_std": 0.015678374096751213, "rewards//mean": 0.75830078125, "rewards//std": 0.033886346966028214, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0702, "grad_norm": 4.341561317443848, "kl": 0.7875896524637938, "learning_rate": 9.909643486313533e-07, "loss": 0.0788, "num_tokens": 3034792.0, "reward": 0.77484130859375, "reward_std": 0.012139599770307541, "rewards//mean": 0.77484130859375, "rewards//std": 0.025107571855187416, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0704, "grad_norm": 34.89895248413086, "kl": 1.1917068948969245, "learning_rate": 9.909041941674204e-07, "loss": 0.1192, "num_tokens": 3043432.0, "reward": 0.7481689453125, "reward_std": 0.014628879725933075, "rewards//mean": 0.7481689453125, "rewards//std": 0.029618307948112488, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0706, "grad_norm": 3.9406888484954834, "kl": 1.3195801004767418, "learning_rate": 9.908438419673366e-07, "loss": 0.132, "num_tokens": 3052008.0, "reward": 0.75390625, "reward_std": 0.01344769075512886, "rewards//mean": 0.75390625, "rewards//std": 0.02626393362879753, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0708, "grad_norm": 5.148041248321533, "kl": 1.028293943963945, "learning_rate": 9.90783292055412e-07, "loss": 0.1028, "num_tokens": 3060680.0, "reward": 0.76177978515625, "reward_std": 0.012960381805896759, "rewards//mean": 0.76177978515625, "rewards//std": 0.031191756948828697, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.071, "grad_norm": 6.117336273193359, "kl": 1.357841451652348, "learning_rate": 9.907225444560361e-07, "loss": 0.1358, "num_tokens": 3069312.0, "reward": 0.761474609375, "reward_std": 0.019020909443497658, "rewards//mean": 0.761474609375, "rewards//std": 0.03357308730483055, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0712, "grad_norm": 4.669897556304932, "kl": 0.9688333803787827, "learning_rate": 9.90661599193678e-07, "loss": 0.0969, "num_tokens": 3078032.0, "reward": 0.74676513671875, "reward_std": 0.014538027346134186, "rewards//mean": 0.74676513671875, "rewards//std": 0.02792271412909031, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0714, "grad_norm": 4.874142169952393, "kl": 1.1960312463343143, "learning_rate": 9.906004562928863e-07, "loss": 0.1196, "num_tokens": 3086656.0, "reward": 0.7301025390625, "reward_std": 0.01636132039129734, "rewards//mean": 0.7301025390625, "rewards//std": 0.025767896324396133, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0716, "grad_norm": 4.525518894195557, "kl": 0.942520503886044, "learning_rate": 9.905391157782897e-07, "loss": 0.0943, "num_tokens": 3095184.0, "reward": 0.74371337890625, "reward_std": 0.013057741336524487, "rewards//mean": 0.74371337890625, "rewards//std": 0.03897321969270706, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0718, "grad_norm": 4.909975528717041, "kl": 0.9185402356088161, "learning_rate": 9.904775776745956e-07, "loss": 0.0919, "num_tokens": 3103768.0, "reward": 0.77825927734375, "reward_std": 0.012431308627128601, "rewards//mean": 0.77825927734375, "rewards//std": 0.034370649605989456, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.072, "grad_norm": 6.116695880889893, "kl": 0.7805115794762969, "learning_rate": 9.904158420065922e-07, "loss": 0.0781, "num_tokens": 3112464.0, "reward": 0.76300048828125, "reward_std": 0.012985588982701302, "rewards//mean": 0.76300048828125, "rewards//std": 0.03321173042058945, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0722, "grad_norm": 4.855072021484375, "kl": 1.4850094048306346, "learning_rate": 9.903539087991461e-07, "loss": 0.1485, "num_tokens": 3121000.0, "reward": 0.70574951171875, "reward_std": 0.013892744667828083, "rewards//mean": 0.70574951171875, "rewards//std": 0.045972708612680435, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0724, "grad_norm": 4.527280330657959, "kl": 1.5802220031619072, "learning_rate": 9.902917780772042e-07, "loss": 0.158, "num_tokens": 3129608.0, "reward": 0.7613525390625, "reward_std": 0.019554516300559044, "rewards//mean": 0.7613525390625, "rewards//std": 0.03348527476191521, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0726, "grad_norm": 4.968642234802246, "kl": 1.7012907210737467, "learning_rate": 9.902294498657929e-07, "loss": 0.1701, "num_tokens": 3138360.0, "reward": 0.74285888671875, "reward_std": 0.015753204002976418, "rewards//mean": 0.74285888671875, "rewards//std": 0.04993502423167229, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0728, "grad_norm": 5.95297384262085, "kl": 1.2917128959670663, "learning_rate": 9.901669241900176e-07, "loss": 0.1292, "num_tokens": 3146912.0, "reward": 0.73187255859375, "reward_std": 0.012270906008780003, "rewards//mean": 0.73187255859375, "rewards//std": 0.035742923617362976, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.073, "grad_norm": 6.7177581787109375, "kl": 1.2866203812882304, "learning_rate": 9.90104201075064e-07, "loss": 0.1287, "num_tokens": 3155568.0, "reward": 0.712890625, "reward_std": 0.01278286799788475, "rewards//mean": 0.712890625, "rewards//std": 0.04508937895298004, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0732, "grad_norm": 4.225528717041016, "kl": 1.4042665641754866, "learning_rate": 9.900412805461966e-07, "loss": 0.1404, "num_tokens": 3164336.0, "reward": 0.75274658203125, "reward_std": 0.014368729665875435, "rewards//mean": 0.75274658203125, "rewards//std": 0.040323637425899506, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0734, "grad_norm": 6.365389823913574, "kl": 2.0204008100554347, "learning_rate": 9.899781626287602e-07, "loss": 0.202, "num_tokens": 3173144.0, "reward": 0.7510986328125, "reward_std": 0.012239251285791397, "rewards//mean": 0.7510986328125, "rewards//std": 0.0335485078394413, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0736, "grad_norm": 3.669586420059204, "kl": 0.9549193810671568, "learning_rate": 9.899148473481784e-07, "loss": 0.0955, "num_tokens": 3181768.0, "reward": 0.7335205078125, "reward_std": 0.010605204850435257, "rewards//mean": 0.7335205078125, "rewards//std": 0.03359900414943695, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0738, "grad_norm": 5.0829691886901855, "kl": 1.339660044759512, "learning_rate": 9.898513347299547e-07, "loss": 0.134, "num_tokens": 3190400.0, "reward": 0.70855712890625, "reward_std": 0.01506077405065298, "rewards//mean": 0.70855712890625, "rewards//std": 0.04746353626251221, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.074, "grad_norm": 4.2541422843933105, "kl": 0.9790180828422308, "learning_rate": 9.89787624799672e-07, "loss": 0.0979, "num_tokens": 3199032.0, "reward": 0.76324462890625, "reward_std": 0.011934377253055573, "rewards//mean": 0.76324462890625, "rewards//std": 0.03299635276198387, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0742, "grad_norm": 4.490905284881592, "kl": 1.3595623094588518, "learning_rate": 9.897237175829926e-07, "loss": 0.136, "num_tokens": 3207600.0, "reward": 0.74859619140625, "reward_std": 0.016955040395259857, "rewards//mean": 0.74859619140625, "rewards//std": 0.03239446505904198, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0744, "grad_norm": 6.865668773651123, "kl": 1.6440453492105007, "learning_rate": 9.896596131056582e-07, "loss": 0.1644, "num_tokens": 3216240.0, "reward": 0.73419189453125, "reward_std": 0.012222332879900932, "rewards//mean": 0.73419189453125, "rewards//std": 0.037753552198410034, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0746, "grad_norm": 4.67140007019043, "kl": 1.8705146927386522, "learning_rate": 9.895953113934903e-07, "loss": 0.1871, "num_tokens": 3224856.0, "reward": 0.77838134765625, "reward_std": 0.022047007456421852, "rewards//mean": 0.77838134765625, "rewards//std": 0.035951532423496246, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0748, "grad_norm": 4.3647847175598145, "kl": 1.3531591948121786, "learning_rate": 9.895308124723896e-07, "loss": 0.1353, "num_tokens": 3233608.0, "reward": 0.76177978515625, "reward_std": 0.016040973365306854, "rewards//mean": 0.76177978515625, "rewards//std": 0.037015385925769806, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.075, "grad_norm": 7.428527355194092, "kl": 1.886878363788128, "learning_rate": 9.89466116368336e-07, "loss": 0.1887, "num_tokens": 3242224.0, "reward": 0.714111328125, "reward_std": 0.013848571106791496, "rewards//mean": 0.714111328125, "rewards//std": 0.051243580877780914, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0752, "grad_norm": 5.991532802581787, "kl": 1.8202048065140843, "learning_rate": 9.894012231073895e-07, "loss": 0.182, "num_tokens": 3250880.0, "reward": 0.74603271484375, "reward_std": 0.016969073563814163, "rewards//mean": 0.74603271484375, "rewards//std": 0.04127238690853119, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0754, "grad_norm": 4.161983013153076, "kl": 1.292853050865233, "learning_rate": 9.893361327156884e-07, "loss": 0.1293, "num_tokens": 3259592.0, "reward": 0.74908447265625, "reward_std": 0.01443011499941349, "rewards//mean": 0.74908447265625, "rewards//std": 0.037323277443647385, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0756, "grad_norm": 4.299761772155762, "kl": 1.0473097246140242, "learning_rate": 9.89270845219452e-07, "loss": 0.1047, "num_tokens": 3268216.0, "reward": 0.76910400390625, "reward_std": 0.018593017011880875, "rewards//mean": 0.76910400390625, "rewards//std": 0.04182399809360504, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0758, "grad_norm": 6.520134925842285, "kl": 1.0033248355612159, "learning_rate": 9.892053606449774e-07, "loss": 0.1003, "num_tokens": 3276832.0, "reward": 0.7694091796875, "reward_std": 0.01282799057662487, "rewards//mean": 0.7694091796875, "rewards//std": 0.03313809260725975, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.076, "grad_norm": 4.847512245178223, "kl": 1.791846677660942, "learning_rate": 9.891396790186422e-07, "loss": 0.1792, "num_tokens": 3285440.0, "reward": 0.7230224609375, "reward_std": 0.014002474956214428, "rewards//mean": 0.7230224609375, "rewards//std": 0.0384167805314064, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0762, "grad_norm": 6.757416725158691, "kl": 1.1047533452510834, "learning_rate": 9.890738003669027e-07, "loss": 0.1105, "num_tokens": 3294056.0, "reward": 0.76422119140625, "reward_std": 0.012958530336618423, "rewards//mean": 0.76422119140625, "rewards//std": 0.04590053856372833, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0764, "grad_norm": 6.155483245849609, "kl": 1.2366701336577535, "learning_rate": 9.89007724716295e-07, "loss": 0.1237, "num_tokens": 3302832.0, "reward": 0.7449951171875, "reward_std": 0.011507006362080574, "rewards//mean": 0.7449951171875, "rewards//std": 0.03625493869185448, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0766, "grad_norm": 4.060118675231934, "kl": 1.4494256637990475, "learning_rate": 9.889414520934343e-07, "loss": 0.1449, "num_tokens": 3311488.0, "reward": 0.74951171875, "reward_std": 0.0157524012029171, "rewards//mean": 0.74951171875, "rewards//std": 0.02822420373558998, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0768, "grad_norm": 5.359792232513428, "kl": 1.7442060075700283, "learning_rate": 9.88874982525015e-07, "loss": 0.1744, "num_tokens": 3320128.0, "reward": 0.71612548828125, "reward_std": 0.018574664369225502, "rewards//mean": 0.71612548828125, "rewards//std": 0.050058554857969284, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.077, "grad_norm": 4.171280860900879, "kl": 0.897304117679596, "learning_rate": 9.888083160378112e-07, "loss": 0.0897, "num_tokens": 3328744.0, "reward": 0.73583984375, "reward_std": 0.010158386081457138, "rewards//mean": 0.73583984375, "rewards//std": 0.02818985842168331, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0772, "grad_norm": 4.775302886962891, "kl": 1.5411655902862549, "learning_rate": 9.887414526586763e-07, "loss": 0.1541, "num_tokens": 3337352.0, "reward": 0.73626708984375, "reward_std": 0.014250718057155609, "rewards//mean": 0.73626708984375, "rewards//std": 0.034910768270492554, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0774, "grad_norm": 4.278979301452637, "kl": 1.5506847016513348, "learning_rate": 9.886743924145426e-07, "loss": 0.1551, "num_tokens": 3345952.0, "reward": 0.77008056640625, "reward_std": 0.0142056904733181, "rewards//mean": 0.77008056640625, "rewards//std": 0.03488908335566521, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0776, "grad_norm": 5.347935676574707, "kl": 1.5720333755016327, "learning_rate": 9.886071353324222e-07, "loss": 0.1572, "num_tokens": 3354552.0, "reward": 0.72705078125, "reward_std": 0.014094813726842403, "rewards//mean": 0.72705078125, "rewards//std": 0.03180694952607155, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0778, "grad_norm": 5.136808395385742, "kl": 1.092930156737566, "learning_rate": 9.88539681439406e-07, "loss": 0.1093, "num_tokens": 3363192.0, "reward": 0.73883056640625, "reward_std": 0.015286346897482872, "rewards//mean": 0.73883056640625, "rewards//std": 0.0341452918946743, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.078, "grad_norm": 6.12586784362793, "kl": 1.5505319805815816, "learning_rate": 9.884720307626646e-07, "loss": 0.1551, "num_tokens": 3371832.0, "reward": 0.741455078125, "reward_std": 0.015485553070902824, "rewards//mean": 0.741455078125, "rewards//std": 0.04268811270594597, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0782, "grad_norm": 3.5782203674316406, "kl": 1.7768533248454332, "learning_rate": 9.884041833294475e-07, "loss": 0.1777, "num_tokens": 3380448.0, "reward": 0.76104736328125, "reward_std": 0.022384842857718468, "rewards//mean": 0.76104736328125, "rewards//std": 0.03959553688764572, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0784, "grad_norm": 7.191686153411865, "kl": 1.9574782270938158, "learning_rate": 9.883361391670839e-07, "loss": 0.1957, "num_tokens": 3389040.0, "reward": 0.74920654296875, "reward_std": 0.01802128367125988, "rewards//mean": 0.74920654296875, "rewards//std": 0.03243042528629303, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0786, "grad_norm": 3.984739303588867, "kl": 2.038462040014565, "learning_rate": 9.882678983029817e-07, "loss": 0.2038, "num_tokens": 3397736.0, "reward": 0.73284912109375, "reward_std": 0.018375638872385025, "rewards//mean": 0.73284912109375, "rewards//std": 0.03866124153137207, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0788, "grad_norm": 5.110982894897461, "kl": 1.4865666590631008, "learning_rate": 9.881994607646286e-07, "loss": 0.1487, "num_tokens": 3406328.0, "reward": 0.74188232421875, "reward_std": 0.010195759125053883, "rewards//mean": 0.74188232421875, "rewards//std": 0.016952887177467346, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.079, "grad_norm": 7.007421016693115, "kl": 2.300824441947043, "learning_rate": 9.881308265795911e-07, "loss": 0.2301, "num_tokens": 3414960.0, "reward": 0.7236328125, "reward_std": 0.01930246688425541, "rewards//mean": 0.7236328125, "rewards//std": 0.04316278174519539, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0792, "grad_norm": 4.596628665924072, "kl": 1.854134103283286, "learning_rate": 9.88061995775515e-07, "loss": 0.1854, "num_tokens": 3423696.0, "reward": 0.7176513671875, "reward_std": 0.014456256292760372, "rewards//mean": 0.7176513671875, "rewards//std": 0.042821768671274185, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0794, "grad_norm": 4.418922424316406, "kl": 1.2172086499631405, "learning_rate": 9.879929683801253e-07, "loss": 0.1217, "num_tokens": 3432248.0, "reward": 0.75030517578125, "reward_std": 0.014133303426206112, "rewards//mean": 0.75030517578125, "rewards//std": 0.03540205955505371, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0796, "grad_norm": 5.769720077514648, "kl": 1.0407563131302595, "learning_rate": 9.879237444212264e-07, "loss": 0.1041, "num_tokens": 3440928.0, "reward": 0.75732421875, "reward_std": 0.01361837238073349, "rewards//mean": 0.75732421875, "rewards//std": 0.026960179209709167, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0798, "grad_norm": 7.309990882873535, "kl": 2.549136446788907, "learning_rate": 9.878543239267014e-07, "loss": 0.2549, "num_tokens": 3449560.0, "reward": 0.73712158203125, "reward_std": 0.01825561933219433, "rewards//mean": 0.73712158203125, "rewards//std": 0.03929543495178223, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.08, "grad_norm": 3.9757120609283447, "kl": 1.892622048035264, "learning_rate": 9.877847069245133e-07, "loss": 0.1893, "num_tokens": 3458264.0, "reward": 0.76141357421875, "reward_std": 0.020533859729766846, "rewards//mean": 0.76141357421875, "rewards//std": 0.03574249893426895, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0802, "grad_norm": 8.19530200958252, "kl": 2.073295334354043, "learning_rate": 9.877148934427035e-07, "loss": 0.2073, "num_tokens": 3466896.0, "reward": 0.76507568359375, "reward_std": 0.015231217257678509, "rewards//mean": 0.76507568359375, "rewards//std": 0.03705707564949989, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0804, "grad_norm": 8.765624046325684, "kl": 1.6512914411723614, "learning_rate": 9.876448835093929e-07, "loss": 0.1651, "num_tokens": 3475584.0, "reward": 0.758544921875, "reward_std": 0.012856746092438698, "rewards//mean": 0.758544921875, "rewards//std": 0.03643092140555382, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0806, "grad_norm": 6.105319976806641, "kl": 1.8445893814787269, "learning_rate": 9.875746771527815e-07, "loss": 0.1845, "num_tokens": 3484184.0, "reward": 0.7689208984375, "reward_std": 0.01619412750005722, "rewards//mean": 0.7689208984375, "rewards//std": 0.0371377170085907, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0808, "grad_norm": 4.7534074783325195, "kl": 1.8480807561427355, "learning_rate": 9.875042744011486e-07, "loss": 0.1848, "num_tokens": 3492760.0, "reward": 0.76025390625, "reward_std": 0.015435409732162952, "rewards//mean": 0.76025390625, "rewards//std": 0.02669837884604931, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.081, "grad_norm": 8.251901626586914, "kl": 2.2260224148631096, "learning_rate": 9.874336752828522e-07, "loss": 0.2226, "num_tokens": 3501344.0, "reward": 0.74627685546875, "reward_std": 0.020439930260181427, "rewards//mean": 0.74627685546875, "rewards//std": 0.03999646008014679, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0812, "grad_norm": 9.156527519226074, "kl": 2.4917550943791866, "learning_rate": 9.873628798263295e-07, "loss": 0.2492, "num_tokens": 3510008.0, "reward": 0.74578857421875, "reward_std": 0.02122437208890915, "rewards//mean": 0.74578857421875, "rewards//std": 0.04651349037885666, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0814, "grad_norm": 5.503023147583008, "kl": 1.6605557333678007, "learning_rate": 9.872918880600973e-07, "loss": 0.1661, "num_tokens": 3518752.0, "reward": 0.7576904296875, "reward_std": 0.013971546664834023, "rewards//mean": 0.7576904296875, "rewards//std": 0.02513265423476696, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0816, "grad_norm": 7.537412166595459, "kl": 1.1888005854561925, "learning_rate": 9.87220700012751e-07, "loss": 0.1189, "num_tokens": 3527400.0, "reward": 0.75775146484375, "reward_std": 0.015135394409298897, "rewards//mean": 0.75775146484375, "rewards//std": 0.04103698208928108, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0818, "grad_norm": 6.6156439781188965, "kl": 2.0969483722001314, "learning_rate": 9.871493157129647e-07, "loss": 0.2097, "num_tokens": 3536064.0, "reward": 0.73748779296875, "reward_std": 0.017476029694080353, "rewards//mean": 0.73748779296875, "rewards//std": 0.03361133486032486, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.082, "grad_norm": 7.873403072357178, "kl": 2.726026590913534, "learning_rate": 9.870777351894926e-07, "loss": 0.2726, "num_tokens": 3544736.0, "reward": 0.70843505859375, "reward_std": 0.01589493826031685, "rewards//mean": 0.70843505859375, "rewards//std": 0.03425506129860878, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0822, "grad_norm": 12.662691116333008, "kl": 2.877569567412138, "learning_rate": 9.870059584711668e-07, "loss": 0.2878, "num_tokens": 3553424.0, "reward": 0.72467041015625, "reward_std": 0.01846178248524666, "rewards//mean": 0.72467041015625, "rewards//std": 0.041550204157829285, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0824, "grad_norm": 8.252856254577637, "kl": 2.546204186975956, "learning_rate": 9.869339855868991e-07, "loss": 0.2546, "num_tokens": 3562040.0, "reward": 0.71368408203125, "reward_std": 0.017232369631528854, "rewards//mean": 0.71368408203125, "rewards//std": 0.039239924401044846, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0826, "grad_norm": 8.939577102661133, "kl": 2.673120856285095, "learning_rate": 9.868618165656804e-07, "loss": 0.2673, "num_tokens": 3570696.0, "reward": 0.73443603515625, "reward_std": 0.01580619066953659, "rewards//mean": 0.73443603515625, "rewards//std": 0.03585202246904373, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0828, "grad_norm": 8.235445976257324, "kl": 1.9868406672030687, "learning_rate": 9.8678945143658e-07, "loss": 0.1987, "num_tokens": 3579256.0, "reward": 0.721435546875, "reward_std": 0.012468339875340462, "rewards//mean": 0.721435546875, "rewards//std": 0.03478484973311424, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.083, "grad_norm": 9.347323417663574, "kl": 2.8131296895444393, "learning_rate": 9.86716890228747e-07, "loss": 0.2813, "num_tokens": 3587928.0, "reward": 0.707275390625, "reward_std": 0.016024313867092133, "rewards//mean": 0.707275390625, "rewards//std": 0.04698273912072182, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0832, "grad_norm": 4.233506202697754, "kl": 1.4603004241362214, "learning_rate": 9.866441329714087e-07, "loss": 0.146, "num_tokens": 3596568.0, "reward": 0.74725341796875, "reward_std": 0.015837572515010834, "rewards//mean": 0.74725341796875, "rewards//std": 0.026224061846733093, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0834, "grad_norm": 4.584789752960205, "kl": 2.1177571155130863, "learning_rate": 9.86571179693872e-07, "loss": 0.2118, "num_tokens": 3605248.0, "reward": 0.75885009765625, "reward_std": 0.020106647163629532, "rewards//mean": 0.75885009765625, "rewards//std": 0.035422153770923615, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0836, "grad_norm": 12.561141967773438, "kl": 2.6870868876576424, "learning_rate": 9.86498030425522e-07, "loss": 0.2687, "num_tokens": 3613840.0, "reward": 0.73394775390625, "reward_std": 0.022721827030181885, "rewards//mean": 0.73394775390625, "rewards//std": 0.04849086329340935, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0838, "grad_norm": 6.3230881690979, "kl": 2.121968600898981, "learning_rate": 9.864246851958237e-07, "loss": 0.2122, "num_tokens": 3622464.0, "reward": 0.744384765625, "reward_std": 0.01947307400405407, "rewards//mean": 0.744384765625, "rewards//std": 0.04458090662956238, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.084, "grad_norm": 5.679402828216553, "kl": 1.2928848881274462, "learning_rate": 9.863511440343205e-07, "loss": 0.1293, "num_tokens": 3631032.0, "reward": 0.73162841796875, "reward_std": 0.016452208161354065, "rewards//mean": 0.73162841796875, "rewards//std": 0.037684522569179535, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0842, "grad_norm": 7.331523418426514, "kl": 2.0377153968438506, "learning_rate": 9.862774069706345e-07, "loss": 0.2038, "num_tokens": 3639672.0, "reward": 0.72991943359375, "reward_std": 0.016138438135385513, "rewards//mean": 0.72991943359375, "rewards//std": 0.04227185249328613, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0844, "grad_norm": 5.33541202545166, "kl": 1.6475609578192234, "learning_rate": 9.862034740344671e-07, "loss": 0.1648, "num_tokens": 3648336.0, "reward": 0.758056640625, "reward_std": 0.020520765334367752, "rewards//mean": 0.758056640625, "rewards//std": 0.037414874881505966, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0846, "grad_norm": 4.5698628425598145, "kl": 1.597356203943491, "learning_rate": 9.861293452555986e-07, "loss": 0.1597, "num_tokens": 3656896.0, "reward": 0.77569580078125, "reward_std": 0.02331678941845894, "rewards//mean": 0.77569580078125, "rewards//std": 0.04867874085903168, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0848, "grad_norm": 7.015743732452393, "kl": 1.2671599444001913, "learning_rate": 9.86055020663888e-07, "loss": 0.1267, "num_tokens": 3665504.0, "reward": 0.72161865234375, "reward_std": 0.01877153106033802, "rewards//mean": 0.72161865234375, "rewards//std": 0.027535663917660713, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.085, "grad_norm": 5.365900993347168, "kl": 1.874972129240632, "learning_rate": 9.859805002892731e-07, "loss": 0.1875, "num_tokens": 3674080.0, "reward": 0.71783447265625, "reward_std": 0.015492199920117855, "rewards//mean": 0.71783447265625, "rewards//std": 0.04610784351825714, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0852, "grad_norm": 4.0368499755859375, "kl": 1.5739709567278624, "learning_rate": 9.859057841617708e-07, "loss": 0.1574, "num_tokens": 3682640.0, "reward": 0.735595703125, "reward_std": 0.01890283264219761, "rewards//mean": 0.735595703125, "rewards//std": 0.04131568968296051, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0854, "grad_norm": 3.79864764213562, "kl": 1.468534404411912, "learning_rate": 9.858308723114768e-07, "loss": 0.1469, "num_tokens": 3691304.0, "reward": 0.7694091796875, "reward_std": 0.01469217799603939, "rewards//mean": 0.7694091796875, "rewards//std": 0.032962214201688766, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0856, "grad_norm": 4.733910083770752, "kl": 1.266563430428505, "learning_rate": 9.857557647685655e-07, "loss": 0.1267, "num_tokens": 3700024.0, "reward": 0.74017333984375, "reward_std": 0.013550288043916225, "rewards//mean": 0.74017333984375, "rewards//std": 0.04085064306855202, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0858, "grad_norm": 5.201381683349609, "kl": 1.3382756058126688, "learning_rate": 9.856804615632901e-07, "loss": 0.1338, "num_tokens": 3708784.0, "reward": 0.73822021484375, "reward_std": 0.015281138941645622, "rewards//mean": 0.73822021484375, "rewards//std": 0.04256092756986618, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.086, "grad_norm": 4.525814056396484, "kl": 1.0764458682388067, "learning_rate": 9.856049627259832e-07, "loss": 0.1076, "num_tokens": 3717392.0, "reward": 0.76513671875, "reward_std": 0.0113562922924757, "rewards//mean": 0.76513671875, "rewards//std": 0.030325647443532944, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0862, "grad_norm": 3.9547278881073, "kl": 1.5455809328705072, "learning_rate": 9.85529268287055e-07, "loss": 0.1546, "num_tokens": 3726008.0, "reward": 0.7393798828125, "reward_std": 0.016390426084399223, "rewards//mean": 0.7393798828125, "rewards//std": 0.033620622009038925, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0864, "grad_norm": 6.644905090332031, "kl": 1.1177070429548621, "learning_rate": 9.854533782769959e-07, "loss": 0.1118, "num_tokens": 3734584.0, "reward": 0.73309326171875, "reward_std": 0.019686318933963776, "rewards//mean": 0.73309326171875, "rewards//std": 0.03657808154821396, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0866, "grad_norm": 7.502194881439209, "kl": 1.0910164881497622, "learning_rate": 9.853772927263739e-07, "loss": 0.1091, "num_tokens": 3743296.0, "reward": 0.77508544921875, "reward_std": 0.02143547311425209, "rewards//mean": 0.77508544921875, "rewards//std": 0.042630936950445175, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0868, "grad_norm": 5.1703057289123535, "kl": 1.7078630905598402, "learning_rate": 9.853010116658366e-07, "loss": 0.1708, "num_tokens": 3752000.0, "reward": 0.73388671875, "reward_std": 0.01481359638273716, "rewards//mean": 0.73388671875, "rewards//std": 0.03354873135685921, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.087, "grad_norm": 5.223759651184082, "kl": 1.200061284005642, "learning_rate": 9.852245351261097e-07, "loss": 0.12, "num_tokens": 3760584.0, "reward": 0.71185302734375, "reward_std": 0.013629972003400326, "rewards//mean": 0.71185302734375, "rewards//std": 0.047321073710918427, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0872, "grad_norm": 6.04955530166626, "kl": 1.1332805044949055, "learning_rate": 9.851478631379982e-07, "loss": 0.1133, "num_tokens": 3769168.0, "reward": 0.735595703125, "reward_std": 0.013865230605006218, "rewards//mean": 0.735595703125, "rewards//std": 0.037858907133340836, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0874, "grad_norm": 4.328035831451416, "kl": 1.4628506265580654, "learning_rate": 9.850709957323854e-07, "loss": 0.1463, "num_tokens": 3777808.0, "reward": 0.73638916015625, "reward_std": 0.016544152051210403, "rewards//mean": 0.73638916015625, "rewards//std": 0.03291091322898865, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0876, "grad_norm": 6.915759563446045, "kl": 1.7527340091764927, "learning_rate": 9.849939329402336e-07, "loss": 0.1753, "num_tokens": 3786392.0, "reward": 0.7340087890625, "reward_std": 0.024360811337828636, "rewards//mean": 0.7340087890625, "rewards//std": 0.03484637290239334, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0878, "grad_norm": 5.501615524291992, "kl": 1.0954801924526691, "learning_rate": 9.849166747925834e-07, "loss": 0.1095, "num_tokens": 3795064.0, "reward": 0.76043701171875, "reward_std": 0.018962692469358444, "rewards//mean": 0.76043701171875, "rewards//std": 0.03665084391832352, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.088, "grad_norm": 5.874672889709473, "kl": 1.0125903934240341, "learning_rate": 9.848392213205547e-07, "loss": 0.1013, "num_tokens": 3803632.0, "reward": 0.74578857421875, "reward_std": 0.01319526880979538, "rewards//mean": 0.74578857421875, "rewards//std": 0.044539760798215866, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0882, "grad_norm": 6.564806938171387, "kl": 2.0192187782377005, "learning_rate": 9.847615725553455e-07, "loss": 0.2019, "num_tokens": 3812296.0, "reward": 0.7786865234375, "reward_std": 0.01627446338534355, "rewards//mean": 0.7786865234375, "rewards//std": 0.030171260237693787, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0884, "grad_norm": 4.285877704620361, "kl": 1.43904594425112, "learning_rate": 9.84683728528233e-07, "loss": 0.1439, "num_tokens": 3820952.0, "reward": 0.74932861328125, "reward_std": 0.013798095285892487, "rewards//mean": 0.74932861328125, "rewards//std": 0.03864205256104469, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0886, "grad_norm": 5.550078392028809, "kl": 2.14881101436913, "learning_rate": 9.846056892705727e-07, "loss": 0.2149, "num_tokens": 3829712.0, "reward": 0.701416015625, "reward_std": 0.019971946254372597, "rewards//mean": 0.701416015625, "rewards//std": 0.045945413410663605, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0888, "grad_norm": 5.137343406677246, "kl": 1.162135485559702, "learning_rate": 9.845274548137985e-07, "loss": 0.1162, "num_tokens": 3838296.0, "reward": 0.72705078125, "reward_std": 0.012567806988954544, "rewards//mean": 0.72705078125, "rewards//std": 0.01751773990690708, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.089, "grad_norm": 5.510349750518799, "kl": 1.5951459687203169, "learning_rate": 9.844490251894236e-07, "loss": 0.1595, "num_tokens": 3846960.0, "reward": 0.7442626953125, "reward_std": 0.01350520271807909, "rewards//mean": 0.7442626953125, "rewards//std": 0.03690710663795471, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0892, "grad_norm": 4.13537073135376, "kl": 1.2297241613268852, "learning_rate": 9.843704004290392e-07, "loss": 0.123, "num_tokens": 3855688.0, "reward": 0.74462890625, "reward_std": 0.009682174772024155, "rewards//mean": 0.74462890625, "rewards//std": 0.035038936883211136, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0894, "grad_norm": 4.711252212524414, "kl": 1.6171795912086964, "learning_rate": 9.842915805643156e-07, "loss": 0.1617, "num_tokens": 3864312.0, "reward": 0.75726318359375, "reward_std": 0.016375333070755005, "rewards//mean": 0.75726318359375, "rewards//std": 0.03374572843313217, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0896, "grad_norm": 6.262650489807129, "kl": 1.0032986970618367, "learning_rate": 9.84212565627001e-07, "loss": 0.1003, "num_tokens": 3872992.0, "reward": 0.7371826171875, "reward_std": 0.01670708693563938, "rewards//mean": 0.7371826171875, "rewards//std": 0.030293432995676994, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0898, "grad_norm": 5.053429126739502, "kl": 1.1675552818924189, "learning_rate": 9.841333556489232e-07, "loss": 0.1168, "num_tokens": 3881712.0, "reward": 0.76287841796875, "reward_std": 0.012518439441919327, "rewards//mean": 0.76287841796875, "rewards//std": 0.03140215948224068, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.09, "grad_norm": 5.673282146453857, "kl": 1.2919608671218157, "learning_rate": 9.840539506619872e-07, "loss": 0.1292, "num_tokens": 3890312.0, "reward": 0.7432861328125, "reward_std": 0.012161493301391602, "rewards//mean": 0.7432861328125, "rewards//std": 0.03512846678495407, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0902, "grad_norm": 3.9209952354431152, "kl": 1.5129262786358595, "learning_rate": 9.83974350698178e-07, "loss": 0.1513, "num_tokens": 3899016.0, "reward": 0.7255859375, "reward_std": 0.01528351753950119, "rewards//mean": 0.7255859375, "rewards//std": 0.0511353462934494, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0904, "grad_norm": 5.624383926391602, "kl": 1.4455570131540298, "learning_rate": 9.838945557895584e-07, "loss": 0.1446, "num_tokens": 3907752.0, "reward": 0.76983642578125, "reward_std": 0.015880919992923737, "rewards//mean": 0.76983642578125, "rewards//std": 0.027342019602656364, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0906, "grad_norm": 9.07286262512207, "kl": 1.7261950299143791, "learning_rate": 9.838145659682692e-07, "loss": 0.1726, "num_tokens": 3916496.0, "reward": 0.77105712890625, "reward_std": 0.009812546893954277, "rewards//mean": 0.77105712890625, "rewards//std": 0.03369365260004997, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0908, "grad_norm": 7.850827693939209, "kl": 2.364481531083584, "learning_rate": 9.83734381266531e-07, "loss": 0.2364, "num_tokens": 3925008.0, "reward": 0.72412109375, "reward_std": 0.01726219430565834, "rewards//mean": 0.72412109375, "rewards//std": 0.041582297533750534, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.091, "grad_norm": 5.526845932006836, "kl": 1.9436415508389473, "learning_rate": 9.836540017166419e-07, "loss": 0.1944, "num_tokens": 3933696.0, "reward": 0.7244873046875, "reward_std": 0.014361506327986717, "rewards//mean": 0.7244873046875, "rewards//std": 0.035631630569696426, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0912, "grad_norm": 5.372095584869385, "kl": 1.3827466797083616, "learning_rate": 9.835734273509785e-07, "loss": 0.1383, "num_tokens": 3942304.0, "reward": 0.77215576171875, "reward_std": 0.01689828932285309, "rewards//mean": 0.77215576171875, "rewards//std": 0.029504306614398956, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0914, "grad_norm": 6.444041728973389, "kl": 1.6388468109071255, "learning_rate": 9.834926582019966e-07, "loss": 0.1639, "num_tokens": 3950968.0, "reward": 0.74346923828125, "reward_std": 0.015370641835033894, "rewards//mean": 0.74346923828125, "rewards//std": 0.03465355932712555, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0916, "grad_norm": 5.43826150894165, "kl": 1.4846863243728876, "learning_rate": 9.834116943022297e-07, "loss": 0.1485, "num_tokens": 3959640.0, "reward": 0.7340087890625, "reward_std": 0.015341941267251968, "rewards//mean": 0.7340087890625, "rewards//std": 0.03952307254076004, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0918, "grad_norm": 6.569565773010254, "kl": 2.233142463490367, "learning_rate": 9.8333053568429e-07, "loss": 0.2233, "num_tokens": 3968256.0, "reward": 0.7635498046875, "reward_std": 0.01592983305454254, "rewards//mean": 0.7635498046875, "rewards//std": 0.030249428004026413, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.092, "grad_norm": 11.522577285766602, "kl": 2.4209519093856215, "learning_rate": 9.832491823808686e-07, "loss": 0.2421, "num_tokens": 3977024.0, "reward": 0.75799560546875, "reward_std": 0.015631375834345818, "rewards//mean": 0.75799560546875, "rewards//std": 0.0318368598818779, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0922, "grad_norm": 5.3075480461120605, "kl": 2.1683051753789186, "learning_rate": 9.831676344247342e-07, "loss": 0.2168, "num_tokens": 3985664.0, "reward": 0.71514892578125, "reward_std": 0.01411198079586029, "rewards//mean": 0.71514892578125, "rewards//std": 0.03985010087490082, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0924, "grad_norm": 4.104161262512207, "kl": 1.1081998217850924, "learning_rate": 9.830858918487346e-07, "loss": 0.1108, "num_tokens": 3994480.0, "reward": 0.75677490234375, "reward_std": 0.01155187003314495, "rewards//mean": 0.75677490234375, "rewards//std": 0.03030760958790779, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0926, "grad_norm": 6.832763195037842, "kl": 2.357150062918663, "learning_rate": 9.830039546857952e-07, "loss": 0.2357, "num_tokens": 4003112.0, "reward": 0.72491455078125, "reward_std": 0.020855844020843506, "rewards//mean": 0.72491455078125, "rewards//std": 0.051110733300447464, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0928, "grad_norm": 5.7236175537109375, "kl": 1.966838133521378, "learning_rate": 9.829218229689209e-07, "loss": 0.1967, "num_tokens": 4011720.0, "reward": 0.7279052734375, "reward_std": 0.022397905588150024, "rewards//mean": 0.7279052734375, "rewards//std": 0.047888245433568954, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.093, "grad_norm": 5.766880035400391, "kl": 1.824003990739584, "learning_rate": 9.828394967311938e-07, "loss": 0.1824, "num_tokens": 4020384.0, "reward": 0.7261962890625, "reward_std": 0.016093678772449493, "rewards//mean": 0.7261962890625, "rewards//std": 0.045839034020900726, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0932, "grad_norm": 4.534486293792725, "kl": 1.870192615315318, "learning_rate": 9.827569760057754e-07, "loss": 0.187, "num_tokens": 4029016.0, "reward": 0.75372314453125, "reward_std": 0.016078153625130653, "rewards//mean": 0.75372314453125, "rewards//std": 0.02742273360490799, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0934, "grad_norm": 7.217731952667236, "kl": 1.1967949345707893, "learning_rate": 9.826742608259047e-07, "loss": 0.1197, "num_tokens": 4037688.0, "reward": 0.7689208984375, "reward_std": 0.015779195353388786, "rewards//mean": 0.7689208984375, "rewards//std": 0.03237839788198471, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0936, "grad_norm": 5.504461288452148, "kl": 2.1448816806077957, "learning_rate": 9.825913512248995e-07, "loss": 0.2145, "num_tokens": 4046304.0, "reward": 0.734375, "reward_std": 0.01801781728863716, "rewards//mean": 0.734375, "rewards//std": 0.029626740142703056, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0938, "grad_norm": 4.837132453918457, "kl": 2.0805433094501495, "learning_rate": 9.825082472361556e-07, "loss": 0.2081, "num_tokens": 4054992.0, "reward": 0.70660400390625, "reward_std": 0.01611001417040825, "rewards//mean": 0.70660400390625, "rewards//std": 0.041034769266843796, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.094, "grad_norm": 5.750391960144043, "kl": 0.7883851593360305, "learning_rate": 9.824249488931475e-07, "loss": 0.0788, "num_tokens": 4063624.0, "reward": 0.74493408203125, "reward_std": 0.009561952203512192, "rewards//mean": 0.74493408203125, "rewards//std": 0.027793388813734055, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0942, "grad_norm": 4.606777191162109, "kl": 0.9498005639761686, "learning_rate": 9.82341456229428e-07, "loss": 0.095, "num_tokens": 4072256.0, "reward": 0.77325439453125, "reward_std": 0.0117096658796072, "rewards//mean": 0.77325439453125, "rewards//std": 0.031346190720796585, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0944, "grad_norm": 5.636474132537842, "kl": 1.744721632450819, "learning_rate": 9.822577692786272e-07, "loss": 0.1745, "num_tokens": 4080920.0, "reward": 0.74176025390625, "reward_std": 0.016624407842755318, "rewards//mean": 0.74176025390625, "rewards//std": 0.030600877478718758, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0946, "grad_norm": 6.466523170471191, "kl": 2.1718217339366674, "learning_rate": 9.821738880744547e-07, "loss": 0.2172, "num_tokens": 4089496.0, "reward": 0.7296142578125, "reward_std": 0.019219931215047836, "rewards//mean": 0.7296142578125, "rewards//std": 0.04213757440447807, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0948, "grad_norm": 4.043447971343994, "kl": 1.4767930824309587, "learning_rate": 9.820898126506979e-07, "loss": 0.1477, "num_tokens": 4098072.0, "reward": 0.7376708984375, "reward_std": 0.02094002068042755, "rewards//mean": 0.7376708984375, "rewards//std": 0.03580961748957634, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.095, "grad_norm": 7.309907913208008, "kl": 1.1765703689306974, "learning_rate": 9.820055430412219e-07, "loss": 0.1177, "num_tokens": 4106672.0, "reward": 0.705078125, "reward_std": 0.014730259776115417, "rewards//mean": 0.705078125, "rewards//std": 0.05473177134990692, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0952, "grad_norm": 6.035542011260986, "kl": 1.0523865576833487, "learning_rate": 9.81921079279971e-07, "loss": 0.1052, "num_tokens": 4115384.0, "reward": 0.74176025390625, "reward_std": 0.0114174485206604, "rewards//mean": 0.74176025390625, "rewards//std": 0.026172636076807976, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0954, "grad_norm": 5.155995845794678, "kl": 1.7892052046954632, "learning_rate": 9.81836421400967e-07, "loss": 0.1789, "num_tokens": 4124064.0, "reward": 0.73919677734375, "reward_std": 0.014689266681671143, "rewards//mean": 0.73919677734375, "rewards//std": 0.039943818002939224, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0956, "grad_norm": 5.833273410797119, "kl": 1.76816301420331, "learning_rate": 9.817515694383102e-07, "loss": 0.1768, "num_tokens": 4132704.0, "reward": 0.738037109375, "reward_std": 0.026206418871879578, "rewards//mean": 0.738037109375, "rewards//std": 0.05074487254023552, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0958, "grad_norm": 3.475642681121826, "kl": 1.7420800495892763, "learning_rate": 9.816665234261786e-07, "loss": 0.1742, "num_tokens": 4141304.0, "reward": 0.7596435546875, "reward_std": 0.015064358711242676, "rewards//mean": 0.7596435546875, "rewards//std": 0.03991486132144928, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.096, "grad_norm": 4.941133499145508, "kl": 1.792766097933054, "learning_rate": 9.81581283398829e-07, "loss": 0.1793, "num_tokens": 4149840.0, "reward": 0.764404296875, "reward_std": 0.020246122032403946, "rewards//mean": 0.764404296875, "rewards//std": 0.04281841218471527, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0962, "grad_norm": 6.210538387298584, "kl": 1.294305069372058, "learning_rate": 9.814958493905962e-07, "loss": 0.1294, "num_tokens": 4158448.0, "reward": 0.71954345703125, "reward_std": 0.01249985583126545, "rewards//mean": 0.71954345703125, "rewards//std": 0.03835659101605415, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0964, "grad_norm": 8.487665176391602, "kl": 1.755100229755044, "learning_rate": 9.814102214358926e-07, "loss": 0.1755, "num_tokens": 4167072.0, "reward": 0.73297119140625, "reward_std": 0.011209162883460522, "rewards//mean": 0.73297119140625, "rewards//std": 0.04069247841835022, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0966, "grad_norm": 5.701470375061035, "kl": 1.4351928923279047, "learning_rate": 9.813243995692097e-07, "loss": 0.1435, "num_tokens": 4175656.0, "reward": 0.76092529296875, "reward_std": 0.01819439046084881, "rewards//mean": 0.76092529296875, "rewards//std": 0.035633064806461334, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0968, "grad_norm": 6.038917541503906, "kl": 1.1271718349307775, "learning_rate": 9.81238383825116e-07, "loss": 0.1127, "num_tokens": 4184232.0, "reward": 0.7322998046875, "reward_std": 0.013266301713883877, "rewards//mean": 0.7322998046875, "rewards//std": 0.02969997003674507, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.097, "grad_norm": 8.786454200744629, "kl": 1.132724966853857, "learning_rate": 9.81152174238259e-07, "loss": 0.1133, "num_tokens": 4192904.0, "reward": 0.7191162109375, "reward_std": 0.010953761637210846, "rewards//mean": 0.7191162109375, "rewards//std": 0.03941875696182251, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0972, "grad_norm": 6.407580852508545, "kl": 2.186642337590456, "learning_rate": 9.810657708433635e-07, "loss": 0.2187, "num_tokens": 4201520.0, "reward": 0.7320556640625, "reward_std": 0.01584211178123951, "rewards//mean": 0.7320556640625, "rewards//std": 0.032352205365896225, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0974, "grad_norm": 5.325110912322998, "kl": 1.8596960436552763, "learning_rate": 9.809791736752332e-07, "loss": 0.186, "num_tokens": 4210128.0, "reward": 0.75628662109375, "reward_std": 0.012583386152982712, "rewards//mean": 0.75628662109375, "rewards//std": 0.03423561155796051, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0976, "grad_norm": 3.9019405841827393, "kl": 1.585777211934328, "learning_rate": 9.808923827687492e-07, "loss": 0.1586, "num_tokens": 4218696.0, "reward": 0.7193603515625, "reward_std": 0.013766135089099407, "rewards//mean": 0.7193603515625, "rewards//std": 0.036146216094493866, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0978, "grad_norm": 6.00661563873291, "kl": 1.2772408034652472, "learning_rate": 9.80805398158871e-07, "loss": 0.1277, "num_tokens": 4227304.0, "reward": 0.76727294921875, "reward_std": 0.02176658809185028, "rewards//mean": 0.76727294921875, "rewards//std": 0.04012569040060043, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.098, "grad_norm": 5.746229648590088, "kl": 2.446054134517908, "learning_rate": 9.80718219880636e-07, "loss": 0.2446, "num_tokens": 4235968.0, "reward": 0.7274169921875, "reward_std": 0.0224794652312994, "rewards//mean": 0.7274169921875, "rewards//std": 0.04227529838681221, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0982, "grad_norm": 4.4794697761535645, "kl": 1.500032465904951, "learning_rate": 9.806308479691594e-07, "loss": 0.15, "num_tokens": 4244616.0, "reward": 0.73822021484375, "reward_std": 0.014269332401454449, "rewards//mean": 0.73822021484375, "rewards//std": 0.03540334478020668, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0984, "grad_norm": 4.810574531555176, "kl": 1.8452752772718668, "learning_rate": 9.805432824596347e-07, "loss": 0.1845, "num_tokens": 4253168.0, "reward": 0.71533203125, "reward_std": 0.015700260177254677, "rewards//mean": 0.71533203125, "rewards//std": 0.029907453805208206, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0986, "grad_norm": 4.90366268157959, "kl": 1.7375615183264017, "learning_rate": 9.804555233873332e-07, "loss": 0.1738, "num_tokens": 4261768.0, "reward": 0.7515869140625, "reward_std": 0.013618829660117626, "rewards//mean": 0.7515869140625, "rewards//std": 0.032613176852464676, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0988, "grad_norm": 6.374400615692139, "kl": 1.3192815203219652, "learning_rate": 9.803675707876048e-07, "loss": 0.1319, "num_tokens": 4270328.0, "reward": 0.72662353515625, "reward_std": 0.011379792355000973, "rewards//mean": 0.72662353515625, "rewards//std": 0.038935136049985886, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.099, "grad_norm": 6.761526584625244, "kl": 1.7310762237757444, "learning_rate": 9.80279424695876e-07, "loss": 0.1731, "num_tokens": 4279016.0, "reward": 0.7420654296875, "reward_std": 0.019092299044132233, "rewards//mean": 0.7420654296875, "rewards//std": 0.03185616806149483, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0992, "grad_norm": 4.999700546264648, "kl": 1.366264495998621, "learning_rate": 9.801910851476524e-07, "loss": 0.1366, "num_tokens": 4287632.0, "reward": 0.7283935546875, "reward_std": 0.012258632108569145, "rewards//mean": 0.7283935546875, "rewards//std": 0.04211745038628578, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0994, "grad_norm": 10.183734893798828, "kl": 1.0184291638433933, "learning_rate": 9.80102552178517e-07, "loss": 0.1018, "num_tokens": 4296256.0, "reward": 0.72607421875, "reward_std": 0.010940630920231342, "rewards//mean": 0.72607421875, "rewards//std": 0.029712455347180367, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0996, "grad_norm": 4.5478925704956055, "kl": 1.484832838177681, "learning_rate": 9.800138258241309e-07, "loss": 0.1485, "num_tokens": 4304944.0, "reward": 0.75970458984375, "reward_std": 0.01730983518064022, "rewards//mean": 0.75970458984375, "rewards//std": 0.04647018760442734, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.0998, "grad_norm": 7.538711071014404, "kl": 1.5070508643984795, "learning_rate": 9.799249061202334e-07, "loss": 0.1507, "num_tokens": 4313560.0, "reward": 0.7239990234375, "reward_std": 0.011308427900075912, "rewards//mean": 0.7239990234375, "rewards//std": 0.038109779357910156, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1, "grad_norm": 8.800752639770508, "kl": 1.108166430145502, "learning_rate": 9.798357931026412e-07, "loss": 0.1108, "num_tokens": 4322208.0, "reward": 0.75616455078125, "reward_std": 0.01142922230064869, "rewards//mean": 0.75616455078125, "rewards//std": 0.027733413502573967, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1002, "grad_norm": 6.367660999298096, "kl": 2.1931713595986366, "learning_rate": 9.797464868072486e-07, "loss": 0.2193, "num_tokens": 4330888.0, "reward": 0.740966796875, "reward_std": 0.017630886286497116, "rewards//mean": 0.740966796875, "rewards//std": 0.03819647058844566, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1004, "grad_norm": 8.521171569824219, "kl": 1.3009831812232733, "learning_rate": 9.796569872700287e-07, "loss": 0.1301, "num_tokens": 4339480.0, "reward": 0.72418212890625, "reward_std": 0.015889937058091164, "rewards//mean": 0.72418212890625, "rewards//std": 0.04660874977707863, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1006, "grad_norm": 6.808535575866699, "kl": 2.014758253470063, "learning_rate": 9.795672945270316e-07, "loss": 0.2015, "num_tokens": 4348208.0, "reward": 0.70208740234375, "reward_std": 0.014105882495641708, "rewards//mean": 0.70208740234375, "rewards//std": 0.04658601060509682, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1008, "grad_norm": 8.12624454498291, "kl": 1.96061559766531, "learning_rate": 9.794774086143857e-07, "loss": 0.1961, "num_tokens": 4356904.0, "reward": 0.74176025390625, "reward_std": 0.016262296587228775, "rewards//mean": 0.74176025390625, "rewards//std": 0.034033387899398804, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.101, "grad_norm": 5.099590301513672, "kl": 1.2936958279460669, "learning_rate": 9.79387329568297e-07, "loss": 0.1294, "num_tokens": 4365528.0, "reward": 0.74072265625, "reward_std": 0.01925729401409626, "rewards//mean": 0.74072265625, "rewards//std": 0.0370873399078846, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1012, "grad_norm": 5.462556838989258, "kl": 1.5399411581456661, "learning_rate": 9.792970574250493e-07, "loss": 0.154, "num_tokens": 4374120.0, "reward": 0.71966552734375, "reward_std": 0.01657002419233322, "rewards//mean": 0.71966552734375, "rewards//std": 0.03408760949969292, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1014, "grad_norm": 5.555461406707764, "kl": 1.4309385670349002, "learning_rate": 9.79206592221004e-07, "loss": 0.1431, "num_tokens": 4382808.0, "reward": 0.74737548828125, "reward_std": 0.018372822552919388, "rewards//mean": 0.74737548828125, "rewards//std": 0.03450646996498108, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1016, "grad_norm": 10.545476913452148, "kl": 2.4767883997410536, "learning_rate": 9.791159339926008e-07, "loss": 0.2477, "num_tokens": 4391536.0, "reward": 0.76025390625, "reward_std": 0.01708972081542015, "rewards//mean": 0.76025390625, "rewards//std": 0.03743833303451538, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1018, "grad_norm": 5.856667995452881, "kl": 1.4919790271669626, "learning_rate": 9.790250827763565e-07, "loss": 0.1492, "num_tokens": 4400152.0, "reward": 0.747802734375, "reward_std": 0.010662312619388103, "rewards//mean": 0.747802734375, "rewards//std": 0.04489491134881973, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.102, "grad_norm": 6.639697074890137, "kl": 1.729028050787747, "learning_rate": 9.789340386088662e-07, "loss": 0.1729, "num_tokens": 4408712.0, "reward": 0.7100830078125, "reward_std": 0.01790030673146248, "rewards//mean": 0.7100830078125, "rewards//std": 0.0462532602250576, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1022, "grad_norm": 6.8193745613098145, "kl": 1.2193750012665987, "learning_rate": 9.788428015268026e-07, "loss": 0.1219, "num_tokens": 4417376.0, "reward": 0.74932861328125, "reward_std": 0.016750093549489975, "rewards//mean": 0.74932861328125, "rewards//std": 0.03208032250404358, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1024, "grad_norm": 15.178659439086914, "kl": 2.2706920113414526, "learning_rate": 9.787513715669157e-07, "loss": 0.2271, "num_tokens": 4426096.0, "reward": 0.7271728515625, "reward_std": 0.015744894742965698, "rewards//mean": 0.7271728515625, "rewards//std": 0.04813040420413017, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1026, "grad_norm": 7.991171360015869, "kl": 2.079538142308593, "learning_rate": 9.786597487660335e-07, "loss": 0.208, "num_tokens": 4434696.0, "reward": 0.72113037109375, "reward_std": 0.015818912535905838, "rewards//mean": 0.72113037109375, "rewards//std": 0.044497936964035034, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1028, "grad_norm": 8.265140533447266, "kl": 2.0102353263646364, "learning_rate": 9.78567933161062e-07, "loss": 0.201, "num_tokens": 4443352.0, "reward": 0.7506103515625, "reward_std": 0.012508335523307323, "rewards//mean": 0.7506103515625, "rewards//std": 0.03863367810845375, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.103, "grad_norm": 7.213916301727295, "kl": 1.9426716212183237, "learning_rate": 9.78475924788984e-07, "loss": 0.1943, "num_tokens": 4452008.0, "reward": 0.74835205078125, "reward_std": 0.02259785309433937, "rewards//mean": 0.74835205078125, "rewards//std": 0.04443359375, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1032, "grad_norm": 6.833401203155518, "kl": 1.6954651195555925, "learning_rate": 9.783837236868609e-07, "loss": 0.1695, "num_tokens": 4460584.0, "reward": 0.744384765625, "reward_std": 0.01196884922683239, "rewards//mean": 0.744384765625, "rewards//std": 0.022955898195505142, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1034, "grad_norm": 5.521451473236084, "kl": 1.6248896569013596, "learning_rate": 9.782913298918308e-07, "loss": 0.1625, "num_tokens": 4469208.0, "reward": 0.75726318359375, "reward_std": 0.0237879641354084, "rewards//mean": 0.75726318359375, "rewards//std": 0.03943195566534996, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1036, "grad_norm": 6.402383804321289, "kl": 1.8084558583796024, "learning_rate": 9.781987434411106e-07, "loss": 0.1808, "num_tokens": 4477880.0, "reward": 0.73272705078125, "reward_std": 0.012692469172179699, "rewards//mean": 0.73272705078125, "rewards//std": 0.03106970526278019, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1038, "grad_norm": 5.475217819213867, "kl": 1.6366539895534515, "learning_rate": 9.781059643719936e-07, "loss": 0.1637, "num_tokens": 4486520.0, "reward": 0.745361328125, "reward_std": 0.020745567977428436, "rewards//mean": 0.745361328125, "rewards//std": 0.04173563793301582, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.104, "grad_norm": 6.359858989715576, "kl": 1.1945130750536919, "learning_rate": 9.780129927218511e-07, "loss": 0.1195, "num_tokens": 4495064.0, "reward": 0.74407958984375, "reward_std": 0.020024165511131287, "rewards//mean": 0.74407958984375, "rewards//std": 0.03967764973640442, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1042, "grad_norm": 5.116205215454102, "kl": 1.5144299790263176, "learning_rate": 9.779198285281326e-07, "loss": 0.1514, "num_tokens": 4503728.0, "reward": 0.71844482421875, "reward_std": 0.01762961782515049, "rewards//mean": 0.71844482421875, "rewards//std": 0.04398159682750702, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1044, "grad_norm": 4.483882904052734, "kl": 1.4487113784998655, "learning_rate": 9.77826471828364e-07, "loss": 0.1449, "num_tokens": 4512344.0, "reward": 0.75860595703125, "reward_std": 0.01794547028839588, "rewards//mean": 0.75860595703125, "rewards//std": 0.036954399198293686, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1046, "grad_norm": 5.407111167907715, "kl": 1.382027082145214, "learning_rate": 9.777329226601501e-07, "loss": 0.1382, "num_tokens": 4520896.0, "reward": 0.73834228515625, "reward_std": 0.01703845150768757, "rewards//mean": 0.73834228515625, "rewards//std": 0.03187534958124161, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1048, "grad_norm": 5.94820499420166, "kl": 1.213211888447404, "learning_rate": 9.776391810611718e-07, "loss": 0.1213, "num_tokens": 4529480.0, "reward": 0.72613525390625, "reward_std": 0.018442852422595024, "rewards//mean": 0.72613525390625, "rewards//std": 0.031436365097761154, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.105, "grad_norm": 4.362919330596924, "kl": 1.3717845249921083, "learning_rate": 9.775452470691885e-07, "loss": 0.1372, "num_tokens": 4538064.0, "reward": 0.74176025390625, "reward_std": 0.01696932688355446, "rewards//mean": 0.74176025390625, "rewards//std": 0.03191047161817551, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1052, "grad_norm": 6.720903396606445, "kl": 1.71204486861825, "learning_rate": 9.774511207220368e-07, "loss": 0.1712, "num_tokens": 4546688.0, "reward": 0.76171875, "reward_std": 0.01861170306801796, "rewards//mean": 0.76171875, "rewards//std": 0.04062476381659508, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1054, "grad_norm": 5.4875311851501465, "kl": 1.5787024535238743, "learning_rate": 9.77356802057631e-07, "loss": 0.1579, "num_tokens": 4555432.0, "reward": 0.7608642578125, "reward_std": 0.02097097411751747, "rewards//mean": 0.7608642578125, "rewards//std": 0.03687756508588791, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1056, "grad_norm": 7.605865955352783, "kl": 1.2236286401748657, "learning_rate": 9.77262291113962e-07, "loss": 0.1224, "num_tokens": 4564064.0, "reward": 0.75970458984375, "reward_std": 0.019591055810451508, "rewards//mean": 0.75970458984375, "rewards//std": 0.03574715927243233, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1058, "grad_norm": 4.3834943771362305, "kl": 1.65463693626225, "learning_rate": 9.771675879290996e-07, "loss": 0.1655, "num_tokens": 4572752.0, "reward": 0.74310302734375, "reward_std": 0.010753463953733444, "rewards//mean": 0.74310302734375, "rewards//std": 0.03205152601003647, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.106, "grad_norm": 8.616353988647461, "kl": 1.006358283571899, "learning_rate": 9.770726925411897e-07, "loss": 0.1006, "num_tokens": 4581432.0, "reward": 0.7774658203125, "reward_std": 0.017439065501093864, "rewards//mean": 0.7774658203125, "rewards//std": 0.031458329409360886, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1062, "grad_norm": 8.021350860595703, "kl": 1.4562967214733362, "learning_rate": 9.769776049884563e-07, "loss": 0.1456, "num_tokens": 4590056.0, "reward": 0.740234375, "reward_std": 0.020591605454683304, "rewards//mean": 0.740234375, "rewards//std": 0.03929740935564041, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1064, "grad_norm": 5.052854537963867, "kl": 1.7414503898471594, "learning_rate": 9.768823253092008e-07, "loss": 0.1741, "num_tokens": 4598776.0, "reward": 0.74359130859375, "reward_std": 0.01788424886763096, "rewards//mean": 0.74359130859375, "rewards//std": 0.029954928904771805, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1066, "grad_norm": 5.531140327453613, "kl": 1.4807850709185004, "learning_rate": 9.767868535418014e-07, "loss": 0.1481, "num_tokens": 4607464.0, "reward": 0.76080322265625, "reward_std": 0.017598077654838562, "rewards//mean": 0.76080322265625, "rewards//std": 0.0396089144051075, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1068, "grad_norm": 7.237030982971191, "kl": 1.5943659655749798, "learning_rate": 9.766911897247146e-07, "loss": 0.1594, "num_tokens": 4616104.0, "reward": 0.7220458984375, "reward_std": 0.010353684425354004, "rewards//mean": 0.7220458984375, "rewards//std": 0.03670475631952286, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.107, "grad_norm": 4.178348541259766, "kl": 1.9010947477072477, "learning_rate": 9.765953338964734e-07, "loss": 0.1901, "num_tokens": 4624760.0, "reward": 0.7391357421875, "reward_std": 0.017037052661180496, "rewards//mean": 0.7391357421875, "rewards//std": 0.040835678577423096, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1072, "grad_norm": 5.015347480773926, "kl": 1.747239861637354, "learning_rate": 9.76499286095689e-07, "loss": 0.1747, "num_tokens": 4633392.0, "reward": 0.74774169921875, "reward_std": 0.020298104733228683, "rewards//mean": 0.74774169921875, "rewards//std": 0.039234522730112076, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1074, "grad_norm": 6.626712322235107, "kl": 2.1457751411944628, "learning_rate": 9.764030463610488e-07, "loss": 0.2146, "num_tokens": 4642072.0, "reward": 0.748046875, "reward_std": 0.02151212841272354, "rewards//mean": 0.748046875, "rewards//std": 0.040012020617723465, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1076, "grad_norm": 14.374396324157715, "kl": 2.592175643891096, "learning_rate": 9.763066147313189e-07, "loss": 0.2592, "num_tokens": 4650720.0, "reward": 0.75128173828125, "reward_std": 0.015068383887410164, "rewards//mean": 0.75128173828125, "rewards//std": 0.03822930157184601, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1078, "grad_norm": 4.086817264556885, "kl": 1.9288067817687988, "learning_rate": 9.762099912453412e-07, "loss": 0.1929, "num_tokens": 4659312.0, "reward": 0.75177001953125, "reward_std": 0.01850154623389244, "rewards//mean": 0.75177001953125, "rewards//std": 0.03926614671945572, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.108, "grad_norm": 5.19843864440918, "kl": 1.9183164574205875, "learning_rate": 9.76113175942036e-07, "loss": 0.1918, "num_tokens": 4667896.0, "reward": 0.7498779296875, "reward_std": 0.014849013648927212, "rewards//mean": 0.7498779296875, "rewards//std": 0.03451990336179733, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1082, "grad_norm": 6.632591247558594, "kl": 1.886117585003376, "learning_rate": 9.760161688604007e-07, "loss": 0.1886, "num_tokens": 4676488.0, "reward": 0.7457275390625, "reward_std": 0.019388720393180847, "rewards//mean": 0.7457275390625, "rewards//std": 0.032790929079055786, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1084, "grad_norm": 8.33913803100586, "kl": 2.0020735822618008, "learning_rate": 9.759189700395095e-07, "loss": 0.2002, "num_tokens": 4685128.0, "reward": 0.74432373046875, "reward_std": 0.01459340751171112, "rewards//mean": 0.74432373046875, "rewards//std": 0.0429711751639843, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1086, "grad_norm": 5.022960186004639, "kl": 2.1866206601262093, "learning_rate": 9.758215795185138e-07, "loss": 0.2187, "num_tokens": 4693824.0, "reward": 0.75189208984375, "reward_std": 0.015742268413305283, "rewards//mean": 0.75189208984375, "rewards//std": 0.0536862388253212, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1088, "grad_norm": 4.4368062019348145, "kl": 1.8616263028234243, "learning_rate": 9.757239973366428e-07, "loss": 0.1862, "num_tokens": 4702472.0, "reward": 0.74822998046875, "reward_std": 0.015865590423345566, "rewards//mean": 0.74822998046875, "rewards//std": 0.04504062980413437, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.109, "grad_norm": 4.997870922088623, "kl": 2.5523458272218704, "learning_rate": 9.756262235332028e-07, "loss": 0.2552, "num_tokens": 4711104.0, "reward": 0.75885009765625, "reward_std": 0.023846883326768875, "rewards//mean": 0.75885009765625, "rewards//std": 0.03539479151368141, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1092, "grad_norm": 10.336973190307617, "kl": 3.0368824899196625, "learning_rate": 9.755282581475767e-07, "loss": 0.3037, "num_tokens": 4719688.0, "reward": 0.70037841796875, "reward_std": 0.01915045827627182, "rewards//mean": 0.70037841796875, "rewards//std": 0.047834914177656174, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1094, "grad_norm": 6.205177307128906, "kl": 1.4204385522753, "learning_rate": 9.754301012192253e-07, "loss": 0.142, "num_tokens": 4728272.0, "reward": 0.7509765625, "reward_std": 0.019058480858802795, "rewards//mean": 0.7509765625, "rewards//std": 0.03560464084148407, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1096, "grad_norm": 9.930331230163574, "kl": 2.1349884532392025, "learning_rate": 9.753317527876856e-07, "loss": 0.2135, "num_tokens": 4736888.0, "reward": 0.7347412109375, "reward_std": 0.011325545608997345, "rewards//mean": 0.7347412109375, "rewards//std": 0.035019706934690475, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1098, "grad_norm": 15.23091983795166, "kl": 2.7829357124865055, "learning_rate": 9.75233212892573e-07, "loss": 0.2783, "num_tokens": 4745456.0, "reward": 0.7529296875, "reward_std": 0.016158465296030045, "rewards//mean": 0.7529296875, "rewards//std": 0.035781074315309525, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.11, "grad_norm": 7.015782833099365, "kl": 2.0335309226065874, "learning_rate": 9.75134481573579e-07, "loss": 0.2034, "num_tokens": 4754000.0, "reward": 0.77471923828125, "reward_std": 0.013891384936869144, "rewards//mean": 0.77471923828125, "rewards//std": 0.028646297752857208, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1102, "grad_norm": 4.9335432052612305, "kl": 2.393045909702778, "learning_rate": 9.750355588704727e-07, "loss": 0.2393, "num_tokens": 4762808.0, "reward": 0.742919921875, "reward_std": 0.020512012764811516, "rewards//mean": 0.742919921875, "rewards//std": 0.041292235255241394, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1104, "grad_norm": 11.090091705322266, "kl": 2.3459647679701447, "learning_rate": 9.749364448231e-07, "loss": 0.2346, "num_tokens": 4771488.0, "reward": 0.73565673828125, "reward_std": 0.017126431688666344, "rewards//mean": 0.73565673828125, "rewards//std": 0.03207654878497124, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1106, "grad_norm": 13.620132446289062, "kl": 2.8425038745626807, "learning_rate": 9.748371394713841e-07, "loss": 0.2843, "num_tokens": 4780144.0, "reward": 0.72735595703125, "reward_std": 0.016672534868121147, "rewards//mean": 0.72735595703125, "rewards//std": 0.039587125182151794, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1108, "grad_norm": 6.813229084014893, "kl": 2.043930523097515, "learning_rate": 9.747376428553253e-07, "loss": 0.2044, "num_tokens": 4788744.0, "reward": 0.703857421875, "reward_std": 0.015327699482440948, "rewards//mean": 0.703857421875, "rewards//std": 0.046113792806863785, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.111, "grad_norm": 6.221364974975586, "kl": 2.4740346297621727, "learning_rate": 9.746379550150008e-07, "loss": 0.2474, "num_tokens": 4797400.0, "reward": 0.73052978515625, "reward_std": 0.022129859775304794, "rewards//mean": 0.73052978515625, "rewards//std": 0.037217672914266586, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1112, "grad_norm": 7.535991668701172, "kl": 2.2564044035971165, "learning_rate": 9.745380759905647e-07, "loss": 0.2256, "num_tokens": 4806144.0, "reward": 0.7777099609375, "reward_std": 0.015177038498222828, "rewards//mean": 0.7777099609375, "rewards//std": 0.036001864820718765, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1114, "grad_norm": 4.758485317230225, "kl": 2.0595958326011896, "learning_rate": 9.744380058222482e-07, "loss": 0.206, "num_tokens": 4814792.0, "reward": 0.7476806640625, "reward_std": 0.023326821625232697, "rewards//mean": 0.7476806640625, "rewards//std": 0.049005601555109024, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1116, "grad_norm": 4.56634521484375, "kl": 1.2647319380193949, "learning_rate": 9.743377445503597e-07, "loss": 0.1265, "num_tokens": 4823488.0, "reward": 0.7706298828125, "reward_std": 0.014785964973270893, "rewards//mean": 0.7706298828125, "rewards//std": 0.03946634382009506, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1118, "grad_norm": 6.108590126037598, "kl": 1.7404382824897766, "learning_rate": 9.742372922152845e-07, "loss": 0.174, "num_tokens": 4832160.0, "reward": 0.76446533203125, "reward_std": 0.020382845774292946, "rewards//mean": 0.76446533203125, "rewards//std": 0.045194968581199646, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.112, "grad_norm": 4.485249996185303, "kl": 1.4466899689286947, "learning_rate": 9.74136648857485e-07, "loss": 0.1447, "num_tokens": 4840776.0, "reward": 0.713134765625, "reward_std": 0.014563288539648056, "rewards//mean": 0.713134765625, "rewards//std": 0.04398473724722862, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1122, "grad_norm": 5.07109260559082, "kl": 1.5551688242703676, "learning_rate": 9.740358145174997e-07, "loss": 0.1555, "num_tokens": 4849480.0, "reward": 0.760009765625, "reward_std": 0.021132897585630417, "rewards//mean": 0.760009765625, "rewards//std": 0.04068359360098839, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1124, "grad_norm": 4.8608880043029785, "kl": 1.7707404978573322, "learning_rate": 9.73934789235945e-07, "loss": 0.1771, "num_tokens": 4858184.0, "reward": 0.74798583984375, "reward_std": 0.026072677224874496, "rewards//mean": 0.74798583984375, "rewards//std": 0.049194399267435074, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1126, "grad_norm": 3.8644888401031494, "kl": 1.1087552718818188, "learning_rate": 9.73833573053514e-07, "loss": 0.1109, "num_tokens": 4866856.0, "reward": 0.763427734375, "reward_std": 0.016999846324324608, "rewards//mean": 0.763427734375, "rewards//std": 0.03900587931275368, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1128, "grad_norm": 5.894697666168213, "kl": 1.2242057928815484, "learning_rate": 9.737321660109766e-07, "loss": 0.1224, "num_tokens": 4875376.0, "reward": 0.727783203125, "reward_std": 0.021369129419326782, "rewards//mean": 0.727783203125, "rewards//std": 0.04520672932267189, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.113, "grad_norm": 5.239418983459473, "kl": 1.495858235284686, "learning_rate": 9.73630568149179e-07, "loss": 0.1496, "num_tokens": 4884008.0, "reward": 0.71240234375, "reward_std": 0.02082645893096924, "rewards//mean": 0.71240234375, "rewards//std": 0.04694470390677452, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1132, "grad_norm": 4.3944268226623535, "kl": 1.3522299639880657, "learning_rate": 9.735287795090454e-07, "loss": 0.1352, "num_tokens": 4892672.0, "reward": 0.75726318359375, "reward_std": 0.018662169575691223, "rewards//mean": 0.75726318359375, "rewards//std": 0.04031875729560852, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1134, "grad_norm": 4.5940046310424805, "kl": 1.7546975184231997, "learning_rate": 9.734268001315759e-07, "loss": 0.1755, "num_tokens": 4901272.0, "reward": 0.71612548828125, "reward_std": 0.021254749968647957, "rewards//mean": 0.71612548828125, "rewards//std": 0.05108821764588356, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1136, "grad_norm": 7.10288667678833, "kl": 0.663728054612875, "learning_rate": 9.733246300578482e-07, "loss": 0.0664, "num_tokens": 4909840.0, "reward": 0.75848388671875, "reward_std": 0.010412106290459633, "rewards//mean": 0.75848388671875, "rewards//std": 0.02831890992820263, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1138, "grad_norm": 7.828158855438232, "kl": 1.429208105430007, "learning_rate": 9.73222269329016e-07, "loss": 0.1429, "num_tokens": 4918384.0, "reward": 0.73187255859375, "reward_std": 0.019287483766674995, "rewards//mean": 0.73187255859375, "rewards//std": 0.026887016370892525, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.114, "grad_norm": 4.482600212097168, "kl": 1.6363477557897568, "learning_rate": 9.731197179863103e-07, "loss": 0.1636, "num_tokens": 4927096.0, "reward": 0.7735595703125, "reward_std": 0.015531946904957294, "rewards//mean": 0.7735595703125, "rewards//std": 0.043442267924547195, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1142, "grad_norm": 7.246872425079346, "kl": 1.1119504496455193, "learning_rate": 9.730169760710385e-07, "loss": 0.1112, "num_tokens": 4935776.0, "reward": 0.73040771484375, "reward_std": 0.014359238557517529, "rewards//mean": 0.73040771484375, "rewards//std": 0.035939738154411316, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1144, "grad_norm": 6.416889667510986, "kl": 1.7196300886571407, "learning_rate": 9.729140436245856e-07, "loss": 0.172, "num_tokens": 4944400.0, "reward": 0.72698974609375, "reward_std": 0.021922361105680466, "rewards//mean": 0.72698974609375, "rewards//std": 0.04413449019193649, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1146, "grad_norm": 4.504310131072998, "kl": 1.8936888501048088, "learning_rate": 9.728109206884125e-07, "loss": 0.1894, "num_tokens": 4953000.0, "reward": 0.7447509765625, "reward_std": 0.024887755513191223, "rewards//mean": 0.7447509765625, "rewards//std": 0.04389292374253273, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1148, "grad_norm": 5.786618709564209, "kl": 1.2161879613995552, "learning_rate": 9.72707607304057e-07, "loss": 0.1216, "num_tokens": 4961696.0, "reward": 0.75347900390625, "reward_std": 0.01602158695459366, "rewards//mean": 0.75347900390625, "rewards//std": 0.03863930702209473, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.115, "grad_norm": 5.082529544830322, "kl": 1.2490643374621868, "learning_rate": 9.726041035131338e-07, "loss": 0.1249, "num_tokens": 4970296.0, "reward": 0.7366943359375, "reward_std": 0.013393068686127663, "rewards//mean": 0.7366943359375, "rewards//std": 0.03220964968204498, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1152, "grad_norm": 5.343238830566406, "kl": 1.8169043827801943, "learning_rate": 9.72500409357334e-07, "loss": 0.1817, "num_tokens": 4979040.0, "reward": 0.742431640625, "reward_std": 0.01911771297454834, "rewards//mean": 0.742431640625, "rewards//std": 0.05153579264879227, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1154, "grad_norm": 7.661114692687988, "kl": 1.20258454605937, "learning_rate": 9.723965248784262e-07, "loss": 0.1203, "num_tokens": 4987720.0, "reward": 0.782958984375, "reward_std": 0.018420346081256866, "rewards//mean": 0.782958984375, "rewards//std": 0.028466660529375076, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1156, "grad_norm": 5.413031101226807, "kl": 1.6013918295502663, "learning_rate": 9.722924501182546e-07, "loss": 0.1601, "num_tokens": 4996352.0, "reward": 0.71514892578125, "reward_std": 0.013243299908936024, "rewards//mean": 0.71514892578125, "rewards//std": 0.039661239832639694, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1158, "grad_norm": 5.931016445159912, "kl": 1.7467154283076525, "learning_rate": 9.721881851187405e-07, "loss": 0.1747, "num_tokens": 5005168.0, "reward": 0.7479248046875, "reward_std": 0.017642071470618248, "rewards//mean": 0.7479248046875, "rewards//std": 0.048654671758413315, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.116, "grad_norm": 5.606500148773193, "kl": 1.4434173591434956, "learning_rate": 9.720837299218818e-07, "loss": 0.1443, "num_tokens": 5013744.0, "reward": 0.75146484375, "reward_std": 0.013743579387664795, "rewards//mean": 0.75146484375, "rewards//std": 0.032626405358314514, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1162, "grad_norm": 5.018765926361084, "kl": 1.4865971878170967, "learning_rate": 9.719790845697532e-07, "loss": 0.1487, "num_tokens": 5022304.0, "reward": 0.74420166015625, "reward_std": 0.01578206568956375, "rewards//mean": 0.74420166015625, "rewards//std": 0.04040352255105972, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1164, "grad_norm": 6.59496545791626, "kl": 1.6979880444705486, "learning_rate": 9.71874249104506e-07, "loss": 0.1698, "num_tokens": 5030944.0, "reward": 0.69598388671875, "reward_std": 0.0127449631690979, "rewards//mean": 0.69598388671875, "rewards//std": 0.03594689816236496, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1166, "grad_norm": 4.271090030670166, "kl": 1.6714221462607384, "learning_rate": 9.717692235683674e-07, "loss": 0.1671, "num_tokens": 5039632.0, "reward": 0.75897216796875, "reward_std": 0.012486828491091728, "rewards//mean": 0.75897216796875, "rewards//std": 0.03377307951450348, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1168, "grad_norm": 6.605570316314697, "kl": 2.5692805107682943, "learning_rate": 9.716640080036423e-07, "loss": 0.2569, "num_tokens": 5048256.0, "reward": 0.734130859375, "reward_std": 0.01749209687113762, "rewards//mean": 0.734130859375, "rewards//std": 0.0323907844722271, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.117, "grad_norm": 4.8662238121032715, "kl": 1.6305445469915867, "learning_rate": 9.715586024527109e-07, "loss": 0.1631, "num_tokens": 5056808.0, "reward": 0.75189208984375, "reward_std": 0.012177273631095886, "rewards//mean": 0.75189208984375, "rewards//std": 0.03622502088546753, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1172, "grad_norm": 5.55615234375, "kl": 2.160937760025263, "learning_rate": 9.714530069580308e-07, "loss": 0.2161, "num_tokens": 5065400.0, "reward": 0.7667236328125, "reward_std": 0.01874316856265068, "rewards//mean": 0.7667236328125, "rewards//std": 0.03359900414943695, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1174, "grad_norm": 4.025786876678467, "kl": 2.293276358395815, "learning_rate": 9.71347221562136e-07, "loss": 0.2293, "num_tokens": 5074064.0, "reward": 0.77850341796875, "reward_std": 0.015237913466989994, "rewards//mean": 0.77850341796875, "rewards//std": 0.03395189717411995, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1176, "grad_norm": 5.526472568511963, "kl": 1.1222283877432346, "learning_rate": 9.712412463076367e-07, "loss": 0.1122, "num_tokens": 5082720.0, "reward": 0.7344970703125, "reward_std": 0.014784103259444237, "rewards//mean": 0.7344970703125, "rewards//std": 0.03695956990122795, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1178, "grad_norm": 4.615313529968262, "kl": 2.069821909070015, "learning_rate": 9.711350812372196e-07, "loss": 0.207, "num_tokens": 5091344.0, "reward": 0.73822021484375, "reward_std": 0.017122317105531693, "rewards//mean": 0.73822021484375, "rewards//std": 0.03217785060405731, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.118, "grad_norm": 3.021036148071289, "kl": 1.9491257146000862, "learning_rate": 9.710287263936483e-07, "loss": 0.1949, "num_tokens": 5100024.0, "reward": 0.7535400390625, "reward_std": 0.014991648495197296, "rewards//mean": 0.7535400390625, "rewards//std": 0.0286011453717947, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1182, "grad_norm": 5.799602031707764, "kl": 2.209879280999303, "learning_rate": 9.709221818197623e-07, "loss": 0.221, "num_tokens": 5108640.0, "reward": 0.7288818359375, "reward_std": 0.018161766231060028, "rewards//mean": 0.7288818359375, "rewards//std": 0.03411585092544556, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1184, "grad_norm": 9.670961380004883, "kl": 2.449349695816636, "learning_rate": 9.708154475584777e-07, "loss": 0.2449, "num_tokens": 5117224.0, "reward": 0.7255859375, "reward_std": 0.021229520440101624, "rewards//mean": 0.7255859375, "rewards//std": 0.05025145411491394, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1186, "grad_norm": 7.468798637390137, "kl": 2.5942456889897585, "learning_rate": 9.707085236527873e-07, "loss": 0.2594, "num_tokens": 5125776.0, "reward": 0.772216796875, "reward_std": 0.01765240915119648, "rewards//mean": 0.772216796875, "rewards//std": 0.034187883138656616, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1188, "grad_norm": 5.847729206085205, "kl": 1.6708920057862997, "learning_rate": 9.706014101457599e-07, "loss": 0.1671, "num_tokens": 5134408.0, "reward": 0.77679443359375, "reward_std": 0.020488444715738297, "rewards//mean": 0.77679443359375, "rewards//std": 0.02839844487607479, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.119, "grad_norm": 4.506982326507568, "kl": 1.6495173051953316, "learning_rate": 9.704941070805405e-07, "loss": 0.165, "num_tokens": 5143040.0, "reward": 0.75848388671875, "reward_std": 0.015759726986289024, "rewards//mean": 0.75848388671875, "rewards//std": 0.03396526724100113, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1192, "grad_norm": 5.312528610229492, "kl": 1.6204978078603745, "learning_rate": 9.70386614500351e-07, "loss": 0.162, "num_tokens": 5151704.0, "reward": 0.74822998046875, "reward_std": 0.01845605857670307, "rewards//mean": 0.74822998046875, "rewards//std": 0.03259988874197006, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1194, "grad_norm": 5.818426609039307, "kl": 1.2239194139838219, "learning_rate": 9.702789324484896e-07, "loss": 0.1224, "num_tokens": 5160400.0, "reward": 0.7462158203125, "reward_std": 0.014775009825825691, "rewards//mean": 0.7462158203125, "rewards//std": 0.03261503577232361, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1196, "grad_norm": 3.984865427017212, "kl": 1.7139388527721167, "learning_rate": 9.701710609683305e-07, "loss": 0.1714, "num_tokens": 5169024.0, "reward": 0.75494384765625, "reward_std": 0.014658878557384014, "rewards//mean": 0.75494384765625, "rewards//std": 0.036619026213884354, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1198, "grad_norm": 4.632446765899658, "kl": 2.2801280226558447, "learning_rate": 9.700630001033243e-07, "loss": 0.228, "num_tokens": 5177672.0, "reward": 0.74786376953125, "reward_std": 0.016614826396107674, "rewards//mean": 0.74786376953125, "rewards//std": 0.035276126116514206, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.12, "grad_norm": 4.269896984100342, "kl": 1.2402616143226624, "learning_rate": 9.699547498969978e-07, "loss": 0.124, "num_tokens": 5186400.0, "reward": 0.7333984375, "reward_std": 0.01491763349622488, "rewards//mean": 0.7333984375, "rewards//std": 0.04049937054514885, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1202, "grad_norm": 5.355497360229492, "kl": 1.1691132951527834, "learning_rate": 9.698463103929541e-07, "loss": 0.1169, "num_tokens": 5195048.0, "reward": 0.7392578125, "reward_std": 0.011865407228469849, "rewards//mean": 0.7392578125, "rewards//std": 0.0266120582818985, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1204, "grad_norm": 4.567091941833496, "kl": 1.8471162002533674, "learning_rate": 9.69737681634873e-07, "loss": 0.1847, "num_tokens": 5203712.0, "reward": 0.76873779296875, "reward_std": 0.013573193922638893, "rewards//mean": 0.76873779296875, "rewards//std": 0.02713640034198761, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1206, "grad_norm": 4.146628379821777, "kl": 2.1998727172613144, "learning_rate": 9.696288636665097e-07, "loss": 0.22, "num_tokens": 5212352.0, "reward": 0.75372314453125, "reward_std": 0.019229482859373093, "rewards//mean": 0.75372314453125, "rewards//std": 0.03928773105144501, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1208, "grad_norm": 10.259482383728027, "kl": 1.3427194859832525, "learning_rate": 9.695198565316964e-07, "loss": 0.1343, "num_tokens": 5220912.0, "reward": 0.7747802734375, "reward_std": 0.01869441568851471, "rewards//mean": 0.7747802734375, "rewards//std": 0.04368827864527702, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.121, "grad_norm": 4.669132232666016, "kl": 1.7059618532657623, "learning_rate": 9.69410660274341e-07, "loss": 0.1706, "num_tokens": 5229616.0, "reward": 0.75067138671875, "reward_std": 0.021110327914357185, "rewards//mean": 0.75067138671875, "rewards//std": 0.037406329065561295, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1212, "grad_norm": 3.9863343238830566, "kl": 1.500737689435482, "learning_rate": 9.693012749384277e-07, "loss": 0.1501, "num_tokens": 5238192.0, "reward": 0.72528076171875, "reward_std": 0.011379316449165344, "rewards//mean": 0.72528076171875, "rewards//std": 0.0314127616584301, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1214, "grad_norm": 5.863799571990967, "kl": 1.294170867651701, "learning_rate": 9.691917005680173e-07, "loss": 0.1294, "num_tokens": 5246720.0, "reward": 0.7669677734375, "reward_std": 0.013313330709934235, "rewards//mean": 0.7669677734375, "rewards//std": 0.034154877066612244, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1216, "grad_norm": 4.813351154327393, "kl": 1.274811888113618, "learning_rate": 9.690819372072456e-07, "loss": 0.1275, "num_tokens": 5255328.0, "reward": 0.7459716796875, "reward_std": 0.014467386528849602, "rewards//mean": 0.7459716796875, "rewards//std": 0.03138124197721481, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1218, "grad_norm": 4.7343244552612305, "kl": 1.8196150474250317, "learning_rate": 9.68971984900326e-07, "loss": 0.182, "num_tokens": 5263992.0, "reward": 0.73419189453125, "reward_std": 0.014000032097101212, "rewards//mean": 0.73419189453125, "rewards//std": 0.041488584131002426, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.122, "grad_norm": 5.491872310638428, "kl": 2.081124259158969, "learning_rate": 9.688618436915468e-07, "loss": 0.2081, "num_tokens": 5272632.0, "reward": 0.7449951171875, "reward_std": 0.013375763781368732, "rewards//mean": 0.7449951171875, "rewards//std": 0.033642228692770004, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1222, "grad_norm": 4.709848880767822, "kl": 1.860938437283039, "learning_rate": 9.68751513625273e-07, "loss": 0.1861, "num_tokens": 5281264.0, "reward": 0.758056640625, "reward_std": 0.012330969795584679, "rewards//mean": 0.758056640625, "rewards//std": 0.029950950294733047, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1224, "grad_norm": 5.294053077697754, "kl": 1.5603632759302855, "learning_rate": 9.686409947459457e-07, "loss": 0.156, "num_tokens": 5290096.0, "reward": 0.72772216796875, "reward_std": 0.019234199076890945, "rewards//mean": 0.72772216796875, "rewards//std": 0.039900969713926315, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1226, "grad_norm": 4.610052108764648, "kl": 1.8261591251939535, "learning_rate": 9.685302870980817e-07, "loss": 0.1826, "num_tokens": 5298720.0, "reward": 0.7579345703125, "reward_std": 0.020076729357242584, "rewards//mean": 0.7579345703125, "rewards//std": 0.03839943930506706, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1228, "grad_norm": 5.603288173675537, "kl": 2.471636150032282, "learning_rate": 9.684193907262742e-07, "loss": 0.2472, "num_tokens": 5307344.0, "reward": 0.726318359375, "reward_std": 0.020474456250667572, "rewards//mean": 0.726318359375, "rewards//std": 0.042128413915634155, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.123, "grad_norm": 8.127079010009766, "kl": 1.6221202835440636, "learning_rate": 9.68308305675192e-07, "loss": 0.1622, "num_tokens": 5315880.0, "reward": 0.764404296875, "reward_std": 0.018329406157135963, "rewards//mean": 0.764404296875, "rewards//std": 0.037109375, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1232, "grad_norm": 4.38250732421875, "kl": 1.0610403437167406, "learning_rate": 9.681970319895802e-07, "loss": 0.1061, "num_tokens": 5324592.0, "reward": 0.763427734375, "reward_std": 0.014990486204624176, "rewards//mean": 0.763427734375, "rewards//std": 0.03242067992687225, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1234, "grad_norm": 7.516284465789795, "kl": 1.3677379209548235, "learning_rate": 9.6808556971426e-07, "loss": 0.1368, "num_tokens": 5333288.0, "reward": 0.75140380859375, "reward_std": 0.012255040928721428, "rewards//mean": 0.75140380859375, "rewards//std": 0.026633594185113907, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1236, "grad_norm": 5.273319721221924, "kl": 1.7700269967317581, "learning_rate": 9.679739188941283e-07, "loss": 0.177, "num_tokens": 5341944.0, "reward": 0.73919677734375, "reward_std": 0.011965281330049038, "rewards//mean": 0.73919677734375, "rewards//std": 0.030803028494119644, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1238, "grad_norm": 5.451859474182129, "kl": 1.7796277161687613, "learning_rate": 9.678620795741582e-07, "loss": 0.178, "num_tokens": 5350712.0, "reward": 0.7913818359375, "reward_std": 0.018562760204076767, "rewards//mean": 0.7913818359375, "rewards//std": 0.03736203908920288, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.124, "grad_norm": 4.603490829467773, "kl": 1.4915839675813913, "learning_rate": 9.677500517993982e-07, "loss": 0.1492, "num_tokens": 5359400.0, "reward": 0.7509765625, "reward_std": 0.015333266928792, "rewards//mean": 0.7509765625, "rewards//std": 0.03739625960588455, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1242, "grad_norm": 4.567759990692139, "kl": 2.169027430936694, "learning_rate": 9.676378356149732e-07, "loss": 0.2169, "num_tokens": 5368040.0, "reward": 0.739990234375, "reward_std": 0.010931908152997494, "rewards//mean": 0.739990234375, "rewards//std": 0.04056435450911522, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1244, "grad_norm": 3.2500686645507812, "kl": 1.6362348943948746, "learning_rate": 9.675254310660841e-07, "loss": 0.1636, "num_tokens": 5376656.0, "reward": 0.73065185546875, "reward_std": 0.01525677926838398, "rewards//mean": 0.73065185546875, "rewards//std": 0.04145354405045509, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1246, "grad_norm": 3.7679014205932617, "kl": 1.2271438892930746, "learning_rate": 9.674128381980071e-07, "loss": 0.1227, "num_tokens": 5385312.0, "reward": 0.74261474609375, "reward_std": 0.00787612609565258, "rewards//mean": 0.74261474609375, "rewards//std": 0.027613617479801178, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1248, "grad_norm": 2.993044853210449, "kl": 1.6120636332780123, "learning_rate": 9.67300057056095e-07, "loss": 0.1612, "num_tokens": 5393888.0, "reward": 0.75115966796875, "reward_std": 0.011264875531196594, "rewards//mean": 0.75115966796875, "rewards//std": 0.03326001018285751, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.125, "grad_norm": 4.802676200866699, "kl": 2.1080329287797213, "learning_rate": 9.671870876857758e-07, "loss": 0.2108, "num_tokens": 5402496.0, "reward": 0.752197265625, "reward_std": 0.016877135261893272, "rewards//mean": 0.752197265625, "rewards//std": 0.04259154945611954, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1252, "grad_norm": 4.552894592285156, "kl": 1.7008184995502234, "learning_rate": 9.670739301325534e-07, "loss": 0.1701, "num_tokens": 5411160.0, "reward": 0.75225830078125, "reward_std": 0.014657140709459782, "rewards//mean": 0.75225830078125, "rewards//std": 0.04453704133629799, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1254, "grad_norm": 9.11898422241211, "kl": 1.379117302596569, "learning_rate": 9.669605844420078e-07, "loss": 0.1379, "num_tokens": 5419800.0, "reward": 0.785400390625, "reward_std": 0.017020780593156815, "rewards//mean": 0.785400390625, "rewards//std": 0.028863780200481415, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1256, "grad_norm": 5.11296272277832, "kl": 2.194600412622094, "learning_rate": 9.668470506597946e-07, "loss": 0.2195, "num_tokens": 5428536.0, "reward": 0.73187255859375, "reward_std": 0.018063906580209732, "rewards//mean": 0.73187255859375, "rewards//std": 0.03468368574976921, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1258, "grad_norm": 4.903803825378418, "kl": 2.3288608007133007, "learning_rate": 9.667333288316453e-07, "loss": 0.2329, "num_tokens": 5437144.0, "reward": 0.78515625, "reward_std": 0.016290197148919106, "rewards//mean": 0.78515625, "rewards//std": 0.03326234221458435, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.126, "grad_norm": 16.58167839050293, "kl": 3.676839765161276, "learning_rate": 9.66619419003367e-07, "loss": 0.3677, "num_tokens": 5445856.0, "reward": 0.74432373046875, "reward_std": 0.01929108425974846, "rewards//mean": 0.74432373046875, "rewards//std": 0.051831282675266266, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1262, "grad_norm": 5.5465989112854, "kl": 2.333956880494952, "learning_rate": 9.665053212208426e-07, "loss": 0.2334, "num_tokens": 5454512.0, "reward": 0.72186279296875, "reward_std": 0.017179621383547783, "rewards//mean": 0.72186279296875, "rewards//std": 0.042921826243400574, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1264, "grad_norm": 5.082263469696045, "kl": 1.5779585037380457, "learning_rate": 9.663910355300304e-07, "loss": 0.1578, "num_tokens": 5463144.0, "reward": 0.75909423828125, "reward_std": 0.018127642571926117, "rewards//mean": 0.75909423828125, "rewards//std": 0.03502202779054642, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1266, "grad_norm": 5.518665313720703, "kl": 1.1804874017834663, "learning_rate": 9.66276561976965e-07, "loss": 0.118, "num_tokens": 5471768.0, "reward": 0.74847412109375, "reward_std": 0.01492733508348465, "rewards//mean": 0.74847412109375, "rewards//std": 0.028540944680571556, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1268, "grad_norm": 3.527820110321045, "kl": 2.0510126557201147, "learning_rate": 9.661619006077561e-07, "loss": 0.2051, "num_tokens": 5480384.0, "reward": 0.73809814453125, "reward_std": 0.014803184196352959, "rewards//mean": 0.73809814453125, "rewards//std": 0.026779266074299812, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.127, "grad_norm": 4.203972339630127, "kl": 2.5589918410405517, "learning_rate": 9.660470514685895e-07, "loss": 0.2559, "num_tokens": 5488984.0, "reward": 0.72760009765625, "reward_std": 0.018420351669192314, "rewards//mean": 0.72760009765625, "rewards//std": 0.03252783417701721, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1272, "grad_norm": 7.727021217346191, "kl": 2.554689183831215, "learning_rate": 9.659320146057262e-07, "loss": 0.2555, "num_tokens": 5497624.0, "reward": 0.80303955078125, "reward_std": 0.020463142544031143, "rewards//mean": 0.80303955078125, "rewards//std": 0.04383542016148567, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1274, "grad_norm": 7.647414207458496, "kl": 2.474998451769352, "learning_rate": 9.65816790065503e-07, "loss": 0.2475, "num_tokens": 5506312.0, "reward": 0.730712890625, "reward_std": 0.02033475786447525, "rewards//mean": 0.730712890625, "rewards//std": 0.03804398328065872, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1276, "grad_norm": 4.044209957122803, "kl": 1.9211649019271135, "learning_rate": 9.657013778943327e-07, "loss": 0.1921, "num_tokens": 5514912.0, "reward": 0.73089599609375, "reward_std": 0.01036878488957882, "rewards//mean": 0.73089599609375, "rewards//std": 0.01902388222515583, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1278, "grad_norm": 10.105807304382324, "kl": 1.2389811612665653, "learning_rate": 9.65585778138703e-07, "loss": 0.1239, "num_tokens": 5523536.0, "reward": 0.7843017578125, "reward_std": 0.011940107680857182, "rewards//mean": 0.7843017578125, "rewards//std": 0.026761524379253387, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.128, "grad_norm": 4.215726375579834, "kl": 2.140462227165699, "learning_rate": 9.654699908451776e-07, "loss": 0.214, "num_tokens": 5532184.0, "reward": 0.7275390625, "reward_std": 0.019287630915641785, "rewards//mean": 0.7275390625, "rewards//std": 0.034492045640945435, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1282, "grad_norm": 7.801215171813965, "kl": 2.9723503328859806, "learning_rate": 9.653540160603955e-07, "loss": 0.2972, "num_tokens": 5540808.0, "reward": 0.75579833984375, "reward_std": 0.021624911576509476, "rewards//mean": 0.75579833984375, "rewards//std": 0.04161427170038223, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1284, "grad_norm": 10.198945045471191, "kl": 2.7207969166338444, "learning_rate": 9.652378538310713e-07, "loss": 0.2721, "num_tokens": 5549448.0, "reward": 0.7222900390625, "reward_std": 0.01807771623134613, "rewards//mean": 0.7222900390625, "rewards//std": 0.04192002862691879, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1286, "grad_norm": 4.129467964172363, "kl": 1.4408370926976204, "learning_rate": 9.651215042039953e-07, "loss": 0.1441, "num_tokens": 5558080.0, "reward": 0.78790283203125, "reward_std": 0.01506801974028349, "rewards//mean": 0.78790283203125, "rewards//std": 0.029104484245181084, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1288, "grad_norm": 5.908904552459717, "kl": 1.435167744755745, "learning_rate": 9.650049672260333e-07, "loss": 0.1435, "num_tokens": 5566712.0, "reward": 0.7703857421875, "reward_std": 0.012270634062588215, "rewards//mean": 0.7703857421875, "rewards//std": 0.03359900414943695, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.129, "grad_norm": 9.64582633972168, "kl": 2.590956222265959, "learning_rate": 9.648882429441256e-07, "loss": 0.2591, "num_tokens": 5575304.0, "reward": 0.76251220703125, "reward_std": 0.019254688173532486, "rewards//mean": 0.76251220703125, "rewards//std": 0.03916887938976288, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1292, "grad_norm": 10.265853881835938, "kl": 2.0684807300567627, "learning_rate": 9.647713314052895e-07, "loss": 0.2068, "num_tokens": 5584080.0, "reward": 0.75274658203125, "reward_std": 0.016576694324612617, "rewards//mean": 0.75274658203125, "rewards//std": 0.03929543495178223, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1294, "grad_norm": 6.955067157745361, "kl": 2.5854177810251713, "learning_rate": 9.646542326566168e-07, "loss": 0.2585, "num_tokens": 5592720.0, "reward": 0.7518310546875, "reward_std": 0.018285535275936127, "rewards//mean": 0.7518310546875, "rewards//std": 0.034582991153001785, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1296, "grad_norm": 3.627410411834717, "kl": 1.9893319997936487, "learning_rate": 9.645369467452745e-07, "loss": 0.1989, "num_tokens": 5601432.0, "reward": 0.76531982421875, "reward_std": 0.01493283361196518, "rewards//mean": 0.76531982421875, "rewards//std": 0.022242441773414612, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1298, "grad_norm": 4.492605209350586, "kl": 1.9629135336726904, "learning_rate": 9.644194737185057e-07, "loss": 0.1963, "num_tokens": 5610040.0, "reward": 0.7099609375, "reward_std": 0.014292044565081596, "rewards//mean": 0.7099609375, "rewards//std": 0.02810811810195446, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.13, "grad_norm": 10.583423614501953, "kl": 1.5659422241151333, "learning_rate": 9.643018136236286e-07, "loss": 0.1566, "num_tokens": 5618752.0, "reward": 0.733642578125, "reward_std": 0.013740007765591145, "rewards//mean": 0.733642578125, "rewards//std": 0.03085121139883995, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1302, "grad_norm": 6.3281683921813965, "kl": 1.6083077788352966, "learning_rate": 9.641839665080363e-07, "loss": 0.1608, "num_tokens": 5627320.0, "reward": 0.7452392578125, "reward_std": 0.012915275990962982, "rewards//mean": 0.7452392578125, "rewards//std": 0.02484673634171486, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1304, "grad_norm": 5.759901523590088, "kl": 1.907441422343254, "learning_rate": 9.640659324191978e-07, "loss": 0.1907, "num_tokens": 5635952.0, "reward": 0.76611328125, "reward_std": 0.01220618188381195, "rewards//mean": 0.76611328125, "rewards//std": 0.03522507846355438, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1306, "grad_norm": 15.900641441345215, "kl": 1.3867097087204456, "learning_rate": 9.639477114046572e-07, "loss": 0.1387, "num_tokens": 5644616.0, "reward": 0.74920654296875, "reward_std": 0.009857980534434319, "rewards//mean": 0.74920654296875, "rewards//std": 0.03010665625333786, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1308, "grad_norm": 4.807596683502197, "kl": 1.720539940521121, "learning_rate": 9.63829303512034e-07, "loss": 0.1721, "num_tokens": 5653272.0, "reward": 0.7734375, "reward_std": 0.018061885610222816, "rewards//mean": 0.7734375, "rewards//std": 0.034400638192892075, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.131, "grad_norm": 8.267892837524414, "kl": 1.309445545077324, "learning_rate": 9.63710708789023e-07, "loss": 0.1309, "num_tokens": 5661888.0, "reward": 0.78753662109375, "reward_std": 0.01528235524892807, "rewards//mean": 0.78753662109375, "rewards//std": 0.02812204882502556, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1312, "grad_norm": 9.04378604888916, "kl": 2.7586640175431967, "learning_rate": 9.635919272833937e-07, "loss": 0.2759, "num_tokens": 5670504.0, "reward": 0.72735595703125, "reward_std": 0.018271243199706078, "rewards//mean": 0.72735595703125, "rewards//std": 0.039427731186151505, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1314, "grad_norm": 4.136282920837402, "kl": 2.15822652541101, "learning_rate": 9.634729590429916e-07, "loss": 0.2158, "num_tokens": 5679168.0, "reward": 0.76129150390625, "reward_std": 0.015018263831734657, "rewards//mean": 0.76129150390625, "rewards//std": 0.03659835085272789, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1316, "grad_norm": 6.392247676849365, "kl": 1.6154142674058676, "learning_rate": 9.63353804115737e-07, "loss": 0.1615, "num_tokens": 5687896.0, "reward": 0.77197265625, "reward_std": 0.018114907667040825, "rewards//mean": 0.77197265625, "rewards//std": 0.033476460725069046, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1318, "grad_norm": 6.043969631195068, "kl": 2.5147317461669445, "learning_rate": 9.632344625496255e-07, "loss": 0.2515, "num_tokens": 5696520.0, "reward": 0.7630615234375, "reward_std": 0.02536243014037609, "rewards//mean": 0.7630615234375, "rewards//std": 0.03221340849995613, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.132, "grad_norm": 6.0581955909729, "kl": 1.5347889047116041, "learning_rate": 9.63114934392728e-07, "loss": 0.1535, "num_tokens": 5705160.0, "reward": 0.71990966796875, "reward_std": 0.010831142775714397, "rewards//mean": 0.71990966796875, "rewards//std": 0.026817677542567253, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1322, "grad_norm": 6.137468338012695, "kl": 1.5315412282943726, "learning_rate": 9.6299521969319e-07, "loss": 0.1532, "num_tokens": 5713848.0, "reward": 0.77978515625, "reward_std": 0.008931154385209084, "rewards//mean": 0.77978515625, "rewards//std": 0.023359866812825203, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1324, "grad_norm": 6.5758233070373535, "kl": 2.190411478281021, "learning_rate": 9.628753184992333e-07, "loss": 0.219, "num_tokens": 5722440.0, "reward": 0.770263671875, "reward_std": 0.02048276737332344, "rewards//mean": 0.770263671875, "rewards//std": 0.03782690688967705, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1326, "grad_norm": 6.5012736320495605, "kl": 1.871248772367835, "learning_rate": 9.627552308591533e-07, "loss": 0.1871, "num_tokens": 5731056.0, "reward": 0.77105712890625, "reward_std": 0.00970059260725975, "rewards//mean": 0.77105712890625, "rewards//std": 0.026508256793022156, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1328, "grad_norm": 8.617633819580078, "kl": 2.442201526835561, "learning_rate": 9.62634956821322e-07, "loss": 0.2442, "num_tokens": 5739704.0, "reward": 0.7359619140625, "reward_std": 0.015514541417360306, "rewards//mean": 0.7359619140625, "rewards//std": 0.04491564258933067, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.133, "grad_norm": 5.606274604797363, "kl": 1.9821086134761572, "learning_rate": 9.625144964341852e-07, "loss": 0.1982, "num_tokens": 5748272.0, "reward": 0.75164794921875, "reward_std": 0.012744968757033348, "rewards//mean": 0.75164794921875, "rewards//std": 0.02847456932067871, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1332, "grad_norm": 8.139525413513184, "kl": 1.9455420151352882, "learning_rate": 9.623938497462645e-07, "loss": 0.1946, "num_tokens": 5756944.0, "reward": 0.77069091796875, "reward_std": 0.011618856340646744, "rewards//mean": 0.77069091796875, "rewards//std": 0.02731875702738762, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1334, "grad_norm": 10.729142189025879, "kl": 2.0259072836488485, "learning_rate": 9.622730168061567e-07, "loss": 0.2026, "num_tokens": 5765464.0, "reward": 0.743896484375, "reward_std": 0.016371024772524834, "rewards//mean": 0.743896484375, "rewards//std": 0.039092715829610825, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1336, "grad_norm": 5.993306636810303, "kl": 2.1834686268121004, "learning_rate": 9.621519976625326e-07, "loss": 0.2183, "num_tokens": 5774152.0, "reward": 0.75738525390625, "reward_std": 0.02642243169248104, "rewards//mean": 0.75738525390625, "rewards//std": 0.04792817682027817, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1338, "grad_norm": 9.258829116821289, "kl": 2.034265171736479, "learning_rate": 9.620307923641392e-07, "loss": 0.2034, "num_tokens": 5782856.0, "reward": 0.75311279296875, "reward_std": 0.011320114135742188, "rewards//mean": 0.75311279296875, "rewards//std": 0.03563985973596573, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.134, "grad_norm": 10.145259857177734, "kl": 3.0949180014431477, "learning_rate": 9.61909400959798e-07, "loss": 0.3095, "num_tokens": 5791512.0, "reward": 0.7398681640625, "reward_std": 0.022302493453025818, "rewards//mean": 0.7398681640625, "rewards//std": 0.04281752556562424, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1342, "grad_norm": 7.549869060516357, "kl": 3.029162682592869, "learning_rate": 9.617878234984054e-07, "loss": 0.3029, "num_tokens": 5800024.0, "reward": 0.7257080078125, "reward_std": 0.019062813371419907, "rewards//mean": 0.7257080078125, "rewards//std": 0.028463203459978104, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1344, "grad_norm": 3.5927047729492188, "kl": 2.5494485460221767, "learning_rate": 9.616660600289327e-07, "loss": 0.2549, "num_tokens": 5808632.0, "reward": 0.74658203125, "reward_std": 0.01976936310529709, "rewards//mean": 0.74658203125, "rewards//std": 0.03476830944418907, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1346, "grad_norm": 5.675290584564209, "kl": 2.3224766980856657, "learning_rate": 9.615441106004262e-07, "loss": 0.2322, "num_tokens": 5817208.0, "reward": 0.7484130859375, "reward_std": 0.026001352816820145, "rewards//mean": 0.7484130859375, "rewards//std": 0.04079562425613403, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1348, "grad_norm": 9.404773712158203, "kl": 2.733437206596136, "learning_rate": 9.614219752620072e-07, "loss": 0.2733, "num_tokens": 5826000.0, "reward": 0.7508544921875, "reward_std": 0.016492925584316254, "rewards//mean": 0.7508544921875, "rewards//std": 0.03719799220561981, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.135, "grad_norm": 15.974115371704102, "kl": 3.1622733511030674, "learning_rate": 9.612996540628717e-07, "loss": 0.3162, "num_tokens": 5834584.0, "reward": 0.74810791015625, "reward_std": 0.021986238658428192, "rewards//mean": 0.74810791015625, "rewards//std": 0.04708118736743927, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1352, "grad_norm": 13.984763145446777, "kl": 3.1775437872856855, "learning_rate": 9.611771470522907e-07, "loss": 0.3178, "num_tokens": 5843224.0, "reward": 0.7183837890625, "reward_std": 0.02330601029098034, "rewards//mean": 0.7183837890625, "rewards//std": 0.03680689260363579, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1354, "grad_norm": 12.465259552001953, "kl": 2.977510152384639, "learning_rate": 9.6105445427961e-07, "loss": 0.2978, "num_tokens": 5851800.0, "reward": 0.6907958984375, "reward_std": 0.024164468050003052, "rewards//mean": 0.6907958984375, "rewards//std": 0.039776578545570374, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1356, "grad_norm": 7.052098751068115, "kl": 2.7345681935548782, "learning_rate": 9.609315757942502e-07, "loss": 0.2735, "num_tokens": 5860408.0, "reward": 0.73309326171875, "reward_std": 0.022397270426154137, "rewards//mean": 0.73309326171875, "rewards//std": 0.039469942450523376, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1358, "grad_norm": 6.85525369644165, "kl": 1.7807729430496693, "learning_rate": 9.608085116457068e-07, "loss": 0.1781, "num_tokens": 5869048.0, "reward": 0.75848388671875, "reward_std": 0.010272054001688957, "rewards//mean": 0.75848388671875, "rewards//std": 0.035156626254320145, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.136, "grad_norm": 6.290085792541504, "kl": 2.1790910735726357, "learning_rate": 9.606852618835502e-07, "loss": 0.2179, "num_tokens": 5877704.0, "reward": 0.75604248046875, "reward_std": 0.01704718917608261, "rewards//mean": 0.75604248046875, "rewards//std": 0.043335992842912674, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1362, "grad_norm": 4.2021484375, "kl": 2.2223317623138428, "learning_rate": 9.60561826557425e-07, "loss": 0.2222, "num_tokens": 5886232.0, "reward": 0.74078369140625, "reward_std": 0.021687496453523636, "rewards//mean": 0.74078369140625, "rewards//std": 0.035477664321660995, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1364, "grad_norm": 5.2405476570129395, "kl": 2.203371226787567, "learning_rate": 9.604382057170512e-07, "loss": 0.2203, "num_tokens": 5894872.0, "reward": 0.72686767578125, "reward_std": 0.01516915112733841, "rewards//mean": 0.72686767578125, "rewards//std": 0.03954504057765007, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1366, "grad_norm": 7.295520305633545, "kl": 1.8267599921673536, "learning_rate": 9.603143994122232e-07, "loss": 0.1827, "num_tokens": 5903480.0, "reward": 0.720703125, "reward_std": 0.016644855961203575, "rewards//mean": 0.720703125, "rewards//std": 0.03944505378603935, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1368, "grad_norm": 5.629857063293457, "kl": 1.5262629892677069, "learning_rate": 9.601904076928102e-07, "loss": 0.1526, "num_tokens": 5912088.0, "reward": 0.748046875, "reward_std": 0.01910483092069626, "rewards//mean": 0.748046875, "rewards//std": 0.03967159241437912, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.137, "grad_norm": 9.598852157592773, "kl": 1.8031053133308887, "learning_rate": 9.60066230608756e-07, "loss": 0.1803, "num_tokens": 5920760.0, "reward": 0.776123046875, "reward_std": 0.011279763653874397, "rewards//mean": 0.776123046875, "rewards//std": 0.02671085111796856, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1372, "grad_norm": 5.456700801849365, "kl": 2.1518442891538143, "learning_rate": 9.599418682100792e-07, "loss": 0.2152, "num_tokens": 5929320.0, "reward": 0.7392578125, "reward_std": 0.02009446546435356, "rewards//mean": 0.7392578125, "rewards//std": 0.03820360451936722, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1374, "grad_norm": 6.295849323272705, "kl": 1.056298403069377, "learning_rate": 9.598173205468727e-07, "loss": 0.1056, "num_tokens": 5937912.0, "reward": 0.73492431640625, "reward_std": 0.01417911984026432, "rewards//mean": 0.73492431640625, "rewards//std": 0.03890012949705124, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1376, "grad_norm": 7.775590419769287, "kl": 1.8858890049159527, "learning_rate": 9.596925876693047e-07, "loss": 0.1886, "num_tokens": 5946624.0, "reward": 0.72491455078125, "reward_std": 0.017337076365947723, "rewards//mean": 0.72491455078125, "rewards//std": 0.03598225489258766, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1378, "grad_norm": 5.867929458618164, "kl": 1.2759409677237272, "learning_rate": 9.595676696276171e-07, "loss": 0.1276, "num_tokens": 5955192.0, "reward": 0.713134765625, "reward_std": 0.013202743604779243, "rewards//mean": 0.713134765625, "rewards//std": 0.04182259738445282, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.138, "grad_norm": 6.387970924377441, "kl": 1.318649284541607, "learning_rate": 9.594425664721274e-07, "loss": 0.1319, "num_tokens": 5963760.0, "reward": 0.76019287109375, "reward_std": 0.01943659968674183, "rewards//mean": 0.76019287109375, "rewards//std": 0.033113591372966766, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1382, "grad_norm": 5.6375651359558105, "kl": 1.0852784998714924, "learning_rate": 9.593172782532267e-07, "loss": 0.1085, "num_tokens": 5972352.0, "reward": 0.74737548828125, "reward_std": 0.013980841264128685, "rewards//mean": 0.74737548828125, "rewards//std": 0.03712359815835953, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1384, "grad_norm": 10.946083068847656, "kl": 1.2588221821933985, "learning_rate": 9.591918050213813e-07, "loss": 0.1259, "num_tokens": 5980920.0, "reward": 0.748291015625, "reward_std": 0.01414998434484005, "rewards//mean": 0.748291015625, "rewards//std": 0.030432282015681267, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1386, "grad_norm": 16.50353240966797, "kl": 1.2514857109636068, "learning_rate": 9.590661468271318e-07, "loss": 0.1251, "num_tokens": 5989576.0, "reward": 0.75640869140625, "reward_std": 0.02333802357316017, "rewards//mean": 0.75640869140625, "rewards//std": 0.045641250908374786, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1388, "grad_norm": 9.173646926879883, "kl": 1.1957140490412712, "learning_rate": 9.589403037210931e-07, "loss": 0.1196, "num_tokens": 5998160.0, "reward": 0.7657470703125, "reward_std": 0.020367193967103958, "rewards//mean": 0.7657470703125, "rewards//std": 0.04024120047688484, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.139, "grad_norm": 5.923418998718262, "kl": 1.862734381109476, "learning_rate": 9.58814275753955e-07, "loss": 0.1863, "num_tokens": 6006776.0, "reward": 0.74822998046875, "reward_std": 0.021520305424928665, "rewards//mean": 0.74822998046875, "rewards//std": 0.04513195529580116, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1392, "grad_norm": 7.198389530181885, "kl": 1.7558288034051657, "learning_rate": 9.586880629764817e-07, "loss": 0.1756, "num_tokens": 6015376.0, "reward": 0.767578125, "reward_std": 0.019421234726905823, "rewards//mean": 0.767578125, "rewards//std": 0.03533834591507912, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1394, "grad_norm": 4.785677433013916, "kl": 2.1740710642188787, "learning_rate": 9.585616654395112e-07, "loss": 0.2174, "num_tokens": 6023976.0, "reward": 0.74737548828125, "reward_std": 0.024873943999409676, "rewards//mean": 0.74737548828125, "rewards//std": 0.04465245455503464, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1396, "grad_norm": 4.419807434082031, "kl": 1.7555591221898794, "learning_rate": 9.584350831939569e-07, "loss": 0.1756, "num_tokens": 6032584.0, "reward": 0.77490234375, "reward_std": 0.022312358021736145, "rewards//mean": 0.77490234375, "rewards//std": 0.03846580535173416, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1398, "grad_norm": 5.374952793121338, "kl": 2.5095388777554035, "learning_rate": 9.58308316290806e-07, "loss": 0.251, "num_tokens": 6041200.0, "reward": 0.7264404296875, "reward_std": 0.016221458092331886, "rewards//mean": 0.7264404296875, "rewards//std": 0.04364251717925072, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.14, "grad_norm": 20.355453491210938, "kl": 2.0094070453196764, "learning_rate": 9.581813647811197e-07, "loss": 0.2009, "num_tokens": 6049776.0, "reward": 0.7491455078125, "reward_std": 0.015299257822334766, "rewards//mean": 0.7491455078125, "rewards//std": 0.04100732132792473, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1402, "grad_norm": 8.70608901977539, "kl": 2.6194078754633665, "learning_rate": 9.580542287160346e-07, "loss": 0.2619, "num_tokens": 6058352.0, "reward": 0.75726318359375, "reward_std": 0.01727396994829178, "rewards//mean": 0.75726318359375, "rewards//std": 0.04509403929114342, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1404, "grad_norm": 5.559673309326172, "kl": 2.009598921984434, "learning_rate": 9.579269081467613e-07, "loss": 0.201, "num_tokens": 6066912.0, "reward": 0.73590087890625, "reward_std": 0.01806057244539261, "rewards//mean": 0.73590087890625, "rewards//std": 0.042766012251377106, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1406, "grad_norm": 7.46162748336792, "kl": 2.4945561960339546, "learning_rate": 9.57799403124584e-07, "loss": 0.2495, "num_tokens": 6075536.0, "reward": 0.74627685546875, "reward_std": 0.025955861434340477, "rewards//mean": 0.74627685546875, "rewards//std": 0.04330524057149887, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1408, "grad_norm": 5.260041236877441, "kl": 2.1661485619843006, "learning_rate": 9.576717137008617e-07, "loss": 0.2166, "num_tokens": 6084152.0, "reward": 0.7259521484375, "reward_std": 0.018186451867222786, "rewards//mean": 0.7259521484375, "rewards//std": 0.046793390065431595, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.141, "grad_norm": 5.460837364196777, "kl": 2.0772647876292467, "learning_rate": 9.575438399270278e-07, "loss": 0.2077, "num_tokens": 6092728.0, "reward": 0.77392578125, "reward_std": 0.025246405974030495, "rewards//mean": 0.77392578125, "rewards//std": 0.03738654404878616, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1412, "grad_norm": 5.491000175476074, "kl": 1.6645305138081312, "learning_rate": 9.5741578185459e-07, "loss": 0.1665, "num_tokens": 6101480.0, "reward": 0.71630859375, "reward_std": 0.01862640306353569, "rewards//mean": 0.71630859375, "rewards//std": 0.052331533282995224, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1414, "grad_norm": 4.078625679016113, "kl": 2.264868099242449, "learning_rate": 9.572875395351301e-07, "loss": 0.2265, "num_tokens": 6110208.0, "reward": 0.72979736328125, "reward_std": 0.02179661951959133, "rewards//mean": 0.72979736328125, "rewards//std": 0.04679512605071068, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1416, "grad_norm": 4.522886753082275, "kl": 1.8940012231469154, "learning_rate": 9.571591130203037e-07, "loss": 0.1894, "num_tokens": 6118832.0, "reward": 0.736328125, "reward_std": 0.016552813351154327, "rewards//mean": 0.736328125, "rewards//std": 0.040132906287908554, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1418, "grad_norm": 14.472113609313965, "kl": 1.6828726809471846, "learning_rate": 9.570305023618415e-07, "loss": 0.1683, "num_tokens": 6127480.0, "reward": 0.75146484375, "reward_std": 0.020189717411994934, "rewards//mean": 0.75146484375, "rewards//std": 0.04437193274497986, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.142, "grad_norm": 4.928863048553467, "kl": 1.7356335557997227, "learning_rate": 9.569017076115475e-07, "loss": 0.1736, "num_tokens": 6136016.0, "reward": 0.7225341796875, "reward_std": 0.024343695491552353, "rewards//mean": 0.7225341796875, "rewards//std": 0.044210441410541534, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1422, "grad_norm": 8.598285675048828, "kl": 2.318613374605775, "learning_rate": 9.567727288213004e-07, "loss": 0.2319, "num_tokens": 6144688.0, "reward": 0.74017333984375, "reward_std": 0.018025681376457214, "rewards//mean": 0.74017333984375, "rewards//std": 0.029098762199282646, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1424, "grad_norm": 3.940495014190674, "kl": 2.2707323767244816, "learning_rate": 9.566435660430527e-07, "loss": 0.2271, "num_tokens": 6153288.0, "reward": 0.74481201171875, "reward_std": 0.02541990950703621, "rewards//mean": 0.74481201171875, "rewards//std": 0.04780135676264763, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1426, "grad_norm": 6.630128383636475, "kl": 1.3487122803926468, "learning_rate": 9.565142193288312e-07, "loss": 0.1349, "num_tokens": 6161936.0, "reward": 0.7525634765625, "reward_std": 0.02380307763814926, "rewards//mean": 0.7525634765625, "rewards//std": 0.04971463978290558, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1428, "grad_norm": 7.120047092437744, "kl": 1.6414924841374159, "learning_rate": 9.563846887307368e-07, "loss": 0.1641, "num_tokens": 6170696.0, "reward": 0.76116943359375, "reward_std": 0.017113400623202324, "rewards//mean": 0.76116943359375, "rewards//std": 0.03396749868988991, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.143, "grad_norm": 6.949481010437012, "kl": 1.45215680077672, "learning_rate": 9.562549743009442e-07, "loss": 0.1452, "num_tokens": 6179384.0, "reward": 0.77239990234375, "reward_std": 0.019084131345152855, "rewards//mean": 0.77239990234375, "rewards//std": 0.038995739072561264, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1432, "grad_norm": 6.028757572174072, "kl": 2.0199192948639393, "learning_rate": 9.561250760917025e-07, "loss": 0.202, "num_tokens": 6187896.0, "reward": 0.75274658203125, "reward_std": 0.017159592360258102, "rewards//mean": 0.75274658203125, "rewards//std": 0.038004521280527115, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1434, "grad_norm": 4.979818820953369, "kl": 1.553580492734909, "learning_rate": 9.55994994155335e-07, "loss": 0.1554, "num_tokens": 6196536.0, "reward": 0.7301025390625, "reward_std": 0.01808983087539673, "rewards//mean": 0.7301025390625, "rewards//std": 0.03162534162402153, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1436, "grad_norm": 5.092225551605225, "kl": 1.3015301302075386, "learning_rate": 9.558647285442381e-07, "loss": 0.1302, "num_tokens": 6205168.0, "reward": 0.74774169921875, "reward_std": 0.01389441266655922, "rewards//mean": 0.74774169921875, "rewards//std": 0.029461177065968513, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1438, "grad_norm": 8.318163871765137, "kl": 2.202155739068985, "learning_rate": 9.55734279310883e-07, "loss": 0.2202, "num_tokens": 6213784.0, "reward": 0.7529296875, "reward_std": 0.017229489982128143, "rewards//mean": 0.7529296875, "rewards//std": 0.03742862865328789, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.144, "grad_norm": 8.821160316467285, "kl": 2.473429564386606, "learning_rate": 9.55603646507815e-07, "loss": 0.2473, "num_tokens": 6222432.0, "reward": 0.74346923828125, "reward_std": 0.018832771107554436, "rewards//mean": 0.74346923828125, "rewards//std": 0.03762904927134514, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1442, "grad_norm": 17.328784942626953, "kl": 1.1039512548595667, "learning_rate": 9.554728301876524e-07, "loss": 0.1104, "num_tokens": 6231232.0, "reward": 0.7550048828125, "reward_std": 0.011336077004671097, "rewards//mean": 0.7550048828125, "rewards//std": 0.038585059344768524, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1444, "grad_norm": 5.136916637420654, "kl": 1.792027784511447, "learning_rate": 9.553418304030885e-07, "loss": 0.1792, "num_tokens": 6239984.0, "reward": 0.74462890625, "reward_std": 0.021673094481229782, "rewards//mean": 0.74462890625, "rewards//std": 0.036027293652296066, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1446, "grad_norm": 4.544358253479004, "kl": 2.2820528466254473, "learning_rate": 9.552106472068897e-07, "loss": 0.2282, "num_tokens": 6248576.0, "reward": 0.73602294921875, "reward_std": 0.02027995139360428, "rewards//mean": 0.73602294921875, "rewards//std": 0.03474952653050423, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1448, "grad_norm": 6.115887641906738, "kl": 2.1123006008565426, "learning_rate": 9.550792806518967e-07, "loss": 0.2112, "num_tokens": 6257128.0, "reward": 0.764404296875, "reward_std": 0.01684088632464409, "rewards//mean": 0.764404296875, "rewards//std": 0.0381837859749794, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.145, "grad_norm": 4.227281093597412, "kl": 2.1123296599835157, "learning_rate": 9.549477307910236e-07, "loss": 0.2112, "num_tokens": 6265768.0, "reward": 0.7764892578125, "reward_std": 0.0175149068236351, "rewards//mean": 0.7764892578125, "rewards//std": 0.02934308350086212, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1452, "grad_norm": 4.551183223724365, "kl": 1.2583585027605295, "learning_rate": 9.548159976772592e-07, "loss": 0.1258, "num_tokens": 6274448.0, "reward": 0.7623291015625, "reward_std": 0.01506746280938387, "rewards//mean": 0.7623291015625, "rewards//std": 0.027011506259441376, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1454, "grad_norm": 6.423621654510498, "kl": 2.4540082439780235, "learning_rate": 9.546840813636652e-07, "loss": 0.2454, "num_tokens": 6283040.0, "reward": 0.74432373046875, "reward_std": 0.01908603496849537, "rewards//mean": 0.74432373046875, "rewards//std": 0.036972418427467346, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1456, "grad_norm": 6.135375499725342, "kl": 1.6809967551380396, "learning_rate": 9.545519819033777e-07, "loss": 0.1681, "num_tokens": 6291704.0, "reward": 0.7060546875, "reward_std": 0.01635526865720749, "rewards//mean": 0.7060546875, "rewards//std": 0.04816451296210289, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1458, "grad_norm": 10.800931930541992, "kl": 1.3858015835285187, "learning_rate": 9.544196993496062e-07, "loss": 0.1386, "num_tokens": 6300344.0, "reward": 0.7584228515625, "reward_std": 0.010101072490215302, "rewards//mean": 0.7584228515625, "rewards//std": 0.03870883211493492, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.146, "grad_norm": 5.990006923675537, "kl": 1.5613158121705055, "learning_rate": 9.54287233755634e-07, "loss": 0.1561, "num_tokens": 6308912.0, "reward": 0.757080078125, "reward_std": 0.02095198445022106, "rewards//mean": 0.757080078125, "rewards//std": 0.039756208658218384, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1462, "grad_norm": 7.086883068084717, "kl": 1.9290961921215057, "learning_rate": 9.541545851748185e-07, "loss": 0.1929, "num_tokens": 6317680.0, "reward": 0.73089599609375, "reward_std": 0.01549561694264412, "rewards//mean": 0.73089599609375, "rewards//std": 0.032764315605163574, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1464, "grad_norm": 4.728109359741211, "kl": 1.6669486071914434, "learning_rate": 9.540217536605905e-07, "loss": 0.1667, "num_tokens": 6326368.0, "reward": 0.79449462890625, "reward_std": 0.013113592751324177, "rewards//mean": 0.79449462890625, "rewards//std": 0.023812895640730858, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1466, "grad_norm": 13.25939655303955, "kl": 3.0511637795716524, "learning_rate": 9.538887392664543e-07, "loss": 0.3051, "num_tokens": 6335016.0, "reward": 0.73712158203125, "reward_std": 0.01813594251871109, "rewards//mean": 0.73712158203125, "rewards//std": 0.04806726053357124, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1468, "grad_norm": 3.8899991512298584, "kl": 1.5022244974970818, "learning_rate": 9.537555420459881e-07, "loss": 0.1502, "num_tokens": 6343744.0, "reward": 0.76983642578125, "reward_std": 0.016706019639968872, "rewards//mean": 0.76983642578125, "rewards//std": 0.03192707151174545, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.147, "grad_norm": 3.834383487701416, "kl": 1.3857309743762016, "learning_rate": 9.53622162052844e-07, "loss": 0.1386, "num_tokens": 6352456.0, "reward": 0.7703857421875, "reward_std": 0.010684870183467865, "rewards//mean": 0.7703857421875, "rewards//std": 0.031446777284145355, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1472, "grad_norm": 5.069825172424316, "kl": 1.4565946031361818, "learning_rate": 9.534885993407474e-07, "loss": 0.1457, "num_tokens": 6361208.0, "reward": 0.762939453125, "reward_std": 0.007309177424758673, "rewards//mean": 0.762939453125, "rewards//std": 0.01368957944214344, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1474, "grad_norm": 5.167150974273682, "kl": 1.761495502665639, "learning_rate": 9.53354853963497e-07, "loss": 0.1761, "num_tokens": 6369800.0, "reward": 0.75775146484375, "reward_std": 0.012911893427371979, "rewards//mean": 0.75775146484375, "rewards//std": 0.02882697992026806, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1476, "grad_norm": 6.316102027893066, "kl": 2.4946143217384815, "learning_rate": 9.532209259749658e-07, "loss": 0.2495, "num_tokens": 6378480.0, "reward": 0.739013671875, "reward_std": 0.01754099503159523, "rewards//mean": 0.739013671875, "rewards//std": 0.03786530718207359, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1478, "grad_norm": 3.7886784076690674, "kl": 2.09511149674654, "learning_rate": 9.530868154290996e-07, "loss": 0.2095, "num_tokens": 6387056.0, "reward": 0.7412109375, "reward_std": 0.02488623559474945, "rewards//mean": 0.7412109375, "rewards//std": 0.052757907658815384, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.148, "grad_norm": 5.090835094451904, "kl": 1.3681064881384373, "learning_rate": 9.529525223799184e-07, "loss": 0.1368, "num_tokens": 6395720.0, "reward": 0.77685546875, "reward_std": 0.010407458990812302, "rewards//mean": 0.77685546875, "rewards//std": 0.028641607612371445, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1482, "grad_norm": 12.468710899353027, "kl": 1.285261558368802, "learning_rate": 9.528180468815154e-07, "loss": 0.1285, "num_tokens": 6404408.0, "reward": 0.7642822265625, "reward_std": 0.013152715750038624, "rewards//mean": 0.7642822265625, "rewards//std": 0.024980410933494568, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1484, "grad_norm": 3.6978514194488525, "kl": 1.7975781913846731, "learning_rate": 9.526833889880572e-07, "loss": 0.1798, "num_tokens": 6413088.0, "reward": 0.76416015625, "reward_std": 0.01844346709549427, "rewards//mean": 0.76416015625, "rewards//std": 0.03828277066349983, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1486, "grad_norm": 4.001753330230713, "kl": 1.6544487792998552, "learning_rate": 9.525485487537841e-07, "loss": 0.1654, "num_tokens": 6421912.0, "reward": 0.768798828125, "reward_std": 0.01586316153407097, "rewards//mean": 0.768798828125, "rewards//std": 0.03408144786953926, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1488, "grad_norm": 3.864835500717163, "kl": 1.9045432973653078, "learning_rate": 9.524135262330098e-07, "loss": 0.1905, "num_tokens": 6430552.0, "reward": 0.77484130859375, "reward_std": 0.02079934999346733, "rewards//mean": 0.77484130859375, "rewards//std": 0.0369359627366066, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.149, "grad_norm": 6.792883396148682, "kl": 1.7718728762120008, "learning_rate": 9.522783214801211e-07, "loss": 0.1772, "num_tokens": 6439144.0, "reward": 0.76434326171875, "reward_std": 0.01758260279893875, "rewards//mean": 0.76434326171875, "rewards//std": 0.026512254029512405, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1492, "grad_norm": 5.843996047973633, "kl": 1.6257364582270384, "learning_rate": 9.521429345495786e-07, "loss": 0.1626, "num_tokens": 6447728.0, "reward": 0.7696533203125, "reward_std": 0.017090419307351112, "rewards//mean": 0.7696533203125, "rewards//std": 0.029943620786070824, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1494, "grad_norm": 5.6099066734313965, "kl": 1.8088597785681486, "learning_rate": 9.520073654959162e-07, "loss": 0.1809, "num_tokens": 6456312.0, "reward": 0.7591552734375, "reward_std": 0.016383163630962372, "rewards//mean": 0.7591552734375, "rewards//std": 0.03935418650507927, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1496, "grad_norm": 3.0421900749206543, "kl": 2.464435391128063, "learning_rate": 9.518716143737409e-07, "loss": 0.2464, "num_tokens": 6464936.0, "reward": 0.77044677734375, "reward_std": 0.019843457266688347, "rewards//mean": 0.77044677734375, "rewards//std": 0.029361844062805176, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1498, "grad_norm": 10.730456352233887, "kl": 1.909607894718647, "learning_rate": 9.517356812377335e-07, "loss": 0.191, "num_tokens": 6473664.0, "reward": 0.73126220703125, "reward_std": 0.01390963513404131, "rewards//mean": 0.73126220703125, "rewards//std": 0.042542073875665665, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.15, "grad_norm": 9.618935585021973, "kl": 2.2544579710811377, "learning_rate": 9.515995661426477e-07, "loss": 0.2254, "num_tokens": 6482200.0, "reward": 0.73931884765625, "reward_std": 0.012467009015381336, "rewards//mean": 0.73931884765625, "rewards//std": 0.03363339602947235, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1502, "grad_norm": 6.5307230949401855, "kl": 1.706093642860651, "learning_rate": 9.514632691433106e-07, "loss": 0.1706, "num_tokens": 6490712.0, "reward": 0.74200439453125, "reward_std": 0.012252597138285637, "rewards//mean": 0.74200439453125, "rewards//std": 0.04213160276412964, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1504, "grad_norm": 6.90416955947876, "kl": 2.5524584986269474, "learning_rate": 9.513267902946227e-07, "loss": 0.2552, "num_tokens": 6499392.0, "reward": 0.76019287109375, "reward_std": 0.01867758482694626, "rewards//mean": 0.76019287109375, "rewards//std": 0.03468542918562889, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1506, "grad_norm": 11.315291404724121, "kl": 3.1306357700377703, "learning_rate": 9.511901296515576e-07, "loss": 0.3131, "num_tokens": 6508160.0, "reward": 0.76019287109375, "reward_std": 0.02278970740735531, "rewards//mean": 0.76019287109375, "rewards//std": 0.03783806040883064, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1508, "grad_norm": 5.300631999969482, "kl": 1.964156363159418, "learning_rate": 9.510532872691623e-07, "loss": 0.1964, "num_tokens": 6516832.0, "reward": 0.72296142578125, "reward_std": 0.016734154894948006, "rewards//mean": 0.72296142578125, "rewards//std": 0.035924993455410004, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.151, "grad_norm": 5.230804443359375, "kl": 1.7890691291540861, "learning_rate": 9.509162632025569e-07, "loss": 0.1789, "num_tokens": 6525520.0, "reward": 0.7613525390625, "reward_std": 0.010999785736203194, "rewards//mean": 0.7613525390625, "rewards//std": 0.02713228575885296, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1512, "grad_norm": 14.658576011657715, "kl": 2.432515686377883, "learning_rate": 9.507790575069345e-07, "loss": 0.2433, "num_tokens": 6534128.0, "reward": 0.75335693359375, "reward_std": 0.016607044264674187, "rewards//mean": 0.75335693359375, "rewards//std": 0.04579753056168556, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1514, "grad_norm": 12.985304832458496, "kl": 2.872597623616457, "learning_rate": 9.506416702375617e-07, "loss": 0.2873, "num_tokens": 6542736.0, "reward": 0.73870849609375, "reward_std": 0.01667785458266735, "rewards//mean": 0.73870849609375, "rewards//std": 0.038795698434114456, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1516, "grad_norm": 8.331042289733887, "kl": 3.2442229371517897, "learning_rate": 9.505041014497779e-07, "loss": 0.3244, "num_tokens": 6551352.0, "reward": 0.74969482421875, "reward_std": 0.020662259310483932, "rewards//mean": 0.74969482421875, "rewards//std": 0.04038140922784805, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1518, "grad_norm": 10.844252586364746, "kl": 2.4486917965114117, "learning_rate": 9.503663511989962e-07, "loss": 0.2449, "num_tokens": 6560040.0, "reward": 0.75738525390625, "reward_std": 0.020630618557333946, "rewards//mean": 0.75738525390625, "rewards//std": 0.03961732238531113, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.152, "grad_norm": 6.808208465576172, "kl": 2.6468605156987906, "learning_rate": 9.502284195407018e-07, "loss": 0.2647, "num_tokens": 6568704.0, "reward": 0.73052978515625, "reward_std": 0.01908203214406967, "rewards//mean": 0.73052978515625, "rewards//std": 0.034255944192409515, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1522, "grad_norm": 8.170414924621582, "kl": 2.510532608255744, "learning_rate": 9.500903065304539e-07, "loss": 0.2511, "num_tokens": 6577336.0, "reward": 0.74847412109375, "reward_std": 0.018339160829782486, "rewards//mean": 0.74847412109375, "rewards//std": 0.03979726508259773, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1524, "grad_norm": 4.826385021209717, "kl": 1.82676731236279, "learning_rate": 9.499520122238845e-07, "loss": 0.1827, "num_tokens": 6585944.0, "reward": 0.74761962890625, "reward_std": 0.012161046266555786, "rewards//mean": 0.74761962890625, "rewards//std": 0.03418605402112007, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1526, "grad_norm": 5.022667407989502, "kl": 0.9693844858556986, "learning_rate": 9.498135366766982e-07, "loss": 0.0969, "num_tokens": 6594568.0, "reward": 0.76568603515625, "reward_std": 0.011609884910285473, "rewards//mean": 0.76568603515625, "rewards//std": 0.02998826466500759, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1528, "grad_norm": 7.236996173858643, "kl": 1.1193734277039766, "learning_rate": 9.496748799446732e-07, "loss": 0.1119, "num_tokens": 6603064.0, "reward": 0.736328125, "reward_std": 0.00927905272692442, "rewards//mean": 0.736328125, "rewards//std": 0.02765641175210476, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.153, "grad_norm": 5.365597248077393, "kl": 1.578007174655795, "learning_rate": 9.495360420836602e-07, "loss": 0.1578, "num_tokens": 6611760.0, "reward": 0.75299072265625, "reward_std": 0.0181528739631176, "rewards//mean": 0.75299072265625, "rewards//std": 0.03481176495552063, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1532, "grad_norm": 4.389156341552734, "kl": 1.9824351072311401, "learning_rate": 9.493970231495834e-07, "loss": 0.1982, "num_tokens": 6620544.0, "reward": 0.76898193359375, "reward_std": 0.019655738025903702, "rewards//mean": 0.76898193359375, "rewards//std": 0.044705647975206375, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1534, "grad_norm": 4.694593906402588, "kl": 1.9320093467831612, "learning_rate": 9.492578231984393e-07, "loss": 0.1932, "num_tokens": 6629192.0, "reward": 0.74249267578125, "reward_std": 0.012822737917304039, "rewards//mean": 0.74249267578125, "rewards//std": 0.03166809305548668, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1536, "grad_norm": 3.990737199783325, "kl": 1.785394612699747, "learning_rate": 9.491184422862979e-07, "loss": 0.1785, "num_tokens": 6637832.0, "reward": 0.72174072265625, "reward_std": 0.013771869242191315, "rewards//mean": 0.72174072265625, "rewards//std": 0.036096084862947464, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1538, "grad_norm": 3.8699402809143066, "kl": 1.9711164645850658, "learning_rate": 9.489788804693015e-07, "loss": 0.1971, "num_tokens": 6646552.0, "reward": 0.77679443359375, "reward_std": 0.024553906172513962, "rewards//mean": 0.77679443359375, "rewards//std": 0.045781660825014114, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.154, "grad_norm": 3.6297686100006104, "kl": 1.7416410017758608, "learning_rate": 9.488391378036659e-07, "loss": 0.1742, "num_tokens": 6655176.0, "reward": 0.7724609375, "reward_std": 0.015902843326330185, "rewards//mean": 0.7724609375, "rewards//std": 0.03722097724676132, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1542, "grad_norm": 11.240531921386719, "kl": 1.5410529263317585, "learning_rate": 9.486992143456791e-07, "loss": 0.1541, "num_tokens": 6663840.0, "reward": 0.761962890625, "reward_std": 0.013263905420899391, "rewards//mean": 0.761962890625, "rewards//std": 0.037814099341630936, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1544, "grad_norm": 10.247224807739258, "kl": 1.236491760239005, "learning_rate": 9.485591101517026e-07, "loss": 0.1236, "num_tokens": 6672440.0, "reward": 0.7432861328125, "reward_std": 0.009625021368265152, "rewards//mean": 0.7432861328125, "rewards//std": 0.026166634634137154, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1546, "grad_norm": 3.6303439140319824, "kl": 1.0167049001902342, "learning_rate": 9.4841882527817e-07, "loss": 0.1017, "num_tokens": 6680960.0, "reward": 0.75933837890625, "reward_std": 0.009588822722434998, "rewards//mean": 0.75933837890625, "rewards//std": 0.024974577128887177, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1548, "grad_norm": 4.442397117614746, "kl": 1.739076443016529, "learning_rate": 9.482783597815882e-07, "loss": 0.1739, "num_tokens": 6689760.0, "reward": 0.7567138671875, "reward_std": 0.02197597734630108, "rewards//mean": 0.7567138671875, "rewards//std": 0.04350633546710014, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.155, "grad_norm": 3.991363525390625, "kl": 1.2640179004520178, "learning_rate": 9.481377137185369e-07, "loss": 0.1264, "num_tokens": 6698392.0, "reward": 0.77264404296875, "reward_std": 0.0130799300968647, "rewards//mean": 0.77264404296875, "rewards//std": 0.029726143926382065, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1552, "grad_norm": 7.799238681793213, "kl": 1.2722574938088655, "learning_rate": 9.479968871456679e-07, "loss": 0.1272, "num_tokens": 6707040.0, "reward": 0.75006103515625, "reward_std": 0.008785752579569817, "rewards//mean": 0.75006103515625, "rewards//std": 0.029744980856776237, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1554, "grad_norm": 7.036464691162109, "kl": 1.5898052733391523, "learning_rate": 9.478558801197064e-07, "loss": 0.159, "num_tokens": 6715672.0, "reward": 0.77020263671875, "reward_std": 0.017948858439922333, "rewards//mean": 0.77020263671875, "rewards//std": 0.03062807209789753, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1556, "grad_norm": 4.282284259796143, "kl": 1.4198356419801712, "learning_rate": 9.4771469269745e-07, "loss": 0.142, "num_tokens": 6724312.0, "reward": 0.76318359375, "reward_std": 0.016161056235432625, "rewards//mean": 0.76318359375, "rewards//std": 0.03690403327345848, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1558, "grad_norm": 6.055977821350098, "kl": 1.6245936155319214, "learning_rate": 9.475733249357688e-07, "loss": 0.1625, "num_tokens": 6732968.0, "reward": 0.761962890625, "reward_std": 0.013400651514530182, "rewards//mean": 0.761962890625, "rewards//std": 0.040138185024261475, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.156, "grad_norm": 7.814728736877441, "kl": 2.0569280479103327, "learning_rate": 9.474317768916059e-07, "loss": 0.2057, "num_tokens": 6741616.0, "reward": 0.75927734375, "reward_std": 0.020186103880405426, "rewards//mean": 0.75927734375, "rewards//std": 0.035492222756147385, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1562, "grad_norm": 5.3690385818481445, "kl": 0.8401883486658335, "learning_rate": 9.472900486219768e-07, "loss": 0.084, "num_tokens": 6750248.0, "reward": 0.77740478515625, "reward_std": 0.019048381596803665, "rewards//mean": 0.77740478515625, "rewards//std": 0.03795868903398514, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1564, "grad_norm": 3.8546175956726074, "kl": 1.0615817327052355, "learning_rate": 9.471481401839696e-07, "loss": 0.1062, "num_tokens": 6758784.0, "reward": 0.76470947265625, "reward_std": 0.01313089206814766, "rewards//mean": 0.76470947265625, "rewards//std": 0.03293849527835846, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1566, "grad_norm": 6.704108715057373, "kl": 1.195159973576665, "learning_rate": 9.470060516347449e-07, "loss": 0.1195, "num_tokens": 6767448.0, "reward": 0.7789306640625, "reward_std": 0.015296168625354767, "rewards//mean": 0.7789306640625, "rewards//std": 0.027653949335217476, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1568, "grad_norm": 7.372798442840576, "kl": 1.1695575397461653, "learning_rate": 9.468637830315362e-07, "loss": 0.117, "num_tokens": 6776224.0, "reward": 0.71832275390625, "reward_std": 0.012001742608845234, "rewards//mean": 0.71832275390625, "rewards//std": 0.04022403433918953, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.157, "grad_norm": 9.356093406677246, "kl": 1.0185332987457514, "learning_rate": 9.467213344316491e-07, "loss": 0.1019, "num_tokens": 6784832.0, "reward": 0.7515869140625, "reward_std": 0.017094669863581657, "rewards//mean": 0.7515869140625, "rewards//std": 0.04253944754600525, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1572, "grad_norm": 5.204826354980469, "kl": 1.2351787276566029, "learning_rate": 9.465787058924619e-07, "loss": 0.1235, "num_tokens": 6793464.0, "reward": 0.78326416015625, "reward_std": 0.01512465812265873, "rewards//mean": 0.78326416015625, "rewards//std": 0.032436493784189224, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1574, "grad_norm": 9.974525451660156, "kl": 1.3910132851451635, "learning_rate": 9.464358974714252e-07, "loss": 0.1391, "num_tokens": 6801992.0, "reward": 0.765869140625, "reward_std": 0.02188878506422043, "rewards//mean": 0.765869140625, "rewards//std": 0.03544698655605316, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1576, "grad_norm": 3.495143175125122, "kl": 1.580335434526205, "learning_rate": 9.462929092260628e-07, "loss": 0.158, "num_tokens": 6810688.0, "reward": 0.73687744140625, "reward_std": 0.012116469442844391, "rewards//mean": 0.73687744140625, "rewards//std": 0.03512776643037796, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1578, "grad_norm": 5.81279182434082, "kl": 1.55155180208385, "learning_rate": 9.461497412139696e-07, "loss": 0.1552, "num_tokens": 6819336.0, "reward": 0.75189208984375, "reward_std": 0.01254141703248024, "rewards//mean": 0.75189208984375, "rewards//std": 0.04477670043706894, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.158, "grad_norm": 5.675829887390137, "kl": 1.8020419720560312, "learning_rate": 9.460063934928141e-07, "loss": 0.1802, "num_tokens": 6828088.0, "reward": 0.7413330078125, "reward_std": 0.014477964490652084, "rewards//mean": 0.7413330078125, "rewards//std": 0.03759951889514923, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1582, "grad_norm": 3.879511594772339, "kl": 2.192951174452901, "learning_rate": 9.458628661203366e-07, "loss": 0.2193, "num_tokens": 6836808.0, "reward": 0.7481689453125, "reward_std": 0.016593217849731445, "rewards//mean": 0.7481689453125, "rewards//std": 0.02330244705080986, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1584, "grad_norm": 4.301689147949219, "kl": 1.8302797842770815, "learning_rate": 9.4571915915435e-07, "loss": 0.183, "num_tokens": 6845400.0, "reward": 0.74737548828125, "reward_std": 0.018161125481128693, "rewards//mean": 0.74737548828125, "rewards//std": 0.031389620155096054, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1586, "grad_norm": 11.259052276611328, "kl": 1.885145427659154, "learning_rate": 9.455752726527392e-07, "loss": 0.1885, "num_tokens": 6854096.0, "reward": 0.720703125, "reward_std": 0.017080646008253098, "rewards//mean": 0.720703125, "rewards//std": 0.03896316885948181, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1588, "grad_norm": 11.50151538848877, "kl": 1.2966818679124117, "learning_rate": 9.454312066734622e-07, "loss": 0.1297, "num_tokens": 6862656.0, "reward": 0.7252197265625, "reward_std": 0.012704441323876381, "rewards//mean": 0.7252197265625, "rewards//std": 0.024351980537176132, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.159, "grad_norm": 6.524673938751221, "kl": 1.916914639994502, "learning_rate": 9.452869612745483e-07, "loss": 0.1917, "num_tokens": 6871248.0, "reward": 0.75933837890625, "reward_std": 0.01959666982293129, "rewards//mean": 0.75933837890625, "rewards//std": 0.03807416185736656, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1592, "grad_norm": 5.548999309539795, "kl": 1.4934558384120464, "learning_rate": 9.451425365140994e-07, "loss": 0.1493, "num_tokens": 6879968.0, "reward": 0.75018310546875, "reward_std": 0.013776395469903946, "rewards//mean": 0.75018310546875, "rewards//std": 0.03590560704469681, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1594, "grad_norm": 7.6942138671875, "kl": 1.9479884691536427, "learning_rate": 9.449979324502903e-07, "loss": 0.1948, "num_tokens": 6888536.0, "reward": 0.74859619140625, "reward_std": 0.023634785786271095, "rewards//mean": 0.74859619140625, "rewards//std": 0.03557311370968819, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1596, "grad_norm": 6.731383323669434, "kl": 2.307126719504595, "learning_rate": 9.448531491413672e-07, "loss": 0.2307, "num_tokens": 6897080.0, "reward": 0.7607421875, "reward_std": 0.013529423624277115, "rewards//mean": 0.7607421875, "rewards//std": 0.03001658245921135, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1598, "grad_norm": 10.119559288024902, "kl": 2.3343022875487804, "learning_rate": 9.447081866456487e-07, "loss": 0.2334, "num_tokens": 6905712.0, "reward": 0.75238037109375, "reward_std": 0.013632997870445251, "rewards//mean": 0.75238037109375, "rewards//std": 0.03842874616384506, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.16, "grad_norm": 7.11630392074585, "kl": 2.8813906107097864, "learning_rate": 9.445630450215259e-07, "loss": 0.2881, "num_tokens": 6914304.0, "reward": 0.75439453125, "reward_std": 0.019328434020280838, "rewards//mean": 0.75439453125, "rewards//std": 0.028877412900328636, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1602, "grad_norm": 6.320123672485352, "kl": 2.823726534843445, "learning_rate": 9.444177243274617e-07, "loss": 0.2824, "num_tokens": 6922960.0, "reward": 0.7745361328125, "reward_std": 0.021904705092310905, "rewards//mean": 0.7745361328125, "rewards//std": 0.03933725878596306, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1604, "grad_norm": 8.631261825561523, "kl": 2.897394433617592, "learning_rate": 9.442722246219913e-07, "loss": 0.2897, "num_tokens": 6931632.0, "reward": 0.72723388671875, "reward_std": 0.019145509228110313, "rewards//mean": 0.72723388671875, "rewards//std": 0.043193940073251724, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1606, "grad_norm": 12.911233901977539, "kl": 2.2146845385432243, "learning_rate": 9.441265459637219e-07, "loss": 0.2215, "num_tokens": 6940272.0, "reward": 0.7568359375, "reward_std": 0.019703920930624008, "rewards//mean": 0.7568359375, "rewards//std": 0.034993976354599, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1608, "grad_norm": 22.874841690063477, "kl": 2.5433691050857306, "learning_rate": 9.43980688411333e-07, "loss": 0.2543, "num_tokens": 6948856.0, "reward": 0.72515869140625, "reward_std": 0.02106834389269352, "rewards//mean": 0.72515869140625, "rewards//std": 0.04134201630949974, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.161, "grad_norm": 10.111599922180176, "kl": 2.4337373562157154, "learning_rate": 9.438346520235758e-07, "loss": 0.2434, "num_tokens": 6957592.0, "reward": 0.7315673828125, "reward_std": 0.020331665873527527, "rewards//mean": 0.7315673828125, "rewards//std": 0.043103642761707306, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1612, "grad_norm": 3.873319625854492, "kl": 2.255781589075923, "learning_rate": 9.436884368592739e-07, "loss": 0.2256, "num_tokens": 6966240.0, "reward": 0.75885009765625, "reward_std": 0.01403405237942934, "rewards//mean": 0.75885009765625, "rewards//std": 0.03315470740199089, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1614, "grad_norm": 7.305696964263916, "kl": 2.495479291304946, "learning_rate": 9.435420429773227e-07, "loss": 0.2495, "num_tokens": 6974904.0, "reward": 0.71661376953125, "reward_std": 0.025170881301164627, "rewards//mean": 0.71661376953125, "rewards//std": 0.05750236287713051, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1616, "grad_norm": 4.483618259429932, "kl": 1.3268167339265347, "learning_rate": 9.433954704366896e-07, "loss": 0.1327, "num_tokens": 6983504.0, "reward": 0.77825927734375, "reward_std": 0.015569431707262993, "rewards//mean": 0.77825927734375, "rewards//std": 0.033930934965610504, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1618, "grad_norm": 3.663708209991455, "kl": 1.371718443930149, "learning_rate": 9.43248719296414e-07, "loss": 0.1372, "num_tokens": 6992216.0, "reward": 0.75067138671875, "reward_std": 0.014409595169126987, "rewards//mean": 0.75067138671875, "rewards//std": 0.03732205927371979, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.162, "grad_norm": 5.115084648132324, "kl": 1.7715710327029228, "learning_rate": 9.431017896156073e-07, "loss": 0.1772, "num_tokens": 7000816.0, "reward": 0.76171875, "reward_std": 0.02263561636209488, "rewards//mean": 0.76171875, "rewards//std": 0.05361855775117874, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1622, "grad_norm": 9.0412015914917, "kl": 1.2946567293256521, "learning_rate": 9.429546814534528e-07, "loss": 0.1295, "num_tokens": 7009480.0, "reward": 0.7667236328125, "reward_std": 0.01590133085846901, "rewards//mean": 0.7667236328125, "rewards//std": 0.0302694384008646, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1624, "grad_norm": 7.579761505126953, "kl": 1.7319267839193344, "learning_rate": 9.428073948692054e-07, "loss": 0.1732, "num_tokens": 7018136.0, "reward": 0.74456787109375, "reward_std": 0.010734092444181442, "rewards//mean": 0.74456787109375, "rewards//std": 0.03490122780203819, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1626, "grad_norm": 3.380894422531128, "kl": 1.1995777301490307, "learning_rate": 9.426599299221924e-07, "loss": 0.12, "num_tokens": 7026824.0, "reward": 0.75201416015625, "reward_std": 0.014339606277644634, "rewards//mean": 0.75201416015625, "rewards//std": 0.03800651431083679, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1628, "grad_norm": 6.378350257873535, "kl": 2.075379339978099, "learning_rate": 9.425122866718127e-07, "loss": 0.2075, "num_tokens": 7035488.0, "reward": 0.75030517578125, "reward_std": 0.019947510212659836, "rewards//mean": 0.75030517578125, "rewards//std": 0.038846779614686966, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.163, "grad_norm": 3.568049907684326, "kl": 1.3485429864376783, "learning_rate": 9.423644651775368e-07, "loss": 0.1349, "num_tokens": 7044072.0, "reward": 0.75067138671875, "reward_std": 0.01583707332611084, "rewards//mean": 0.75067138671875, "rewards//std": 0.027801012620329857, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1632, "grad_norm": 5.05710506439209, "kl": 1.1457455083727837, "learning_rate": 9.422164654989071e-07, "loss": 0.1146, "num_tokens": 7052744.0, "reward": 0.7564697265625, "reward_std": 0.012639300897717476, "rewards//mean": 0.7564697265625, "rewards//std": 0.030058663338422775, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1634, "grad_norm": 19.65692901611328, "kl": 1.0076731331646442, "learning_rate": 9.420682876955381e-07, "loss": 0.1008, "num_tokens": 7061384.0, "reward": 0.73345947265625, "reward_std": 0.014296118170022964, "rewards//mean": 0.73345947265625, "rewards//std": 0.03193797543644905, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1636, "grad_norm": 3.6845571994781494, "kl": 1.843559268862009, "learning_rate": 9.419199318271156e-07, "loss": 0.1844, "num_tokens": 7070008.0, "reward": 0.73309326171875, "reward_std": 0.010316584259271622, "rewards//mean": 0.73309326171875, "rewards//std": 0.040529586374759674, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1638, "grad_norm": 9.300296783447266, "kl": 1.147592481225729, "learning_rate": 9.417713979533974e-07, "loss": 0.1148, "num_tokens": 7078680.0, "reward": 0.77520751953125, "reward_std": 0.015799185261130333, "rewards//mean": 0.77520751953125, "rewards//std": 0.028133351355791092, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.164, "grad_norm": 3.4130032062530518, "kl": 1.270079467445612, "learning_rate": 9.41622686134213e-07, "loss": 0.127, "num_tokens": 7087336.0, "reward": 0.7349853515625, "reward_std": 0.011345919221639633, "rewards//mean": 0.7349853515625, "rewards//std": 0.03531242161989212, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1642, "grad_norm": 7.637694835662842, "kl": 1.5577540304511786, "learning_rate": 9.414737964294634e-07, "loss": 0.1558, "num_tokens": 7096016.0, "reward": 0.75079345703125, "reward_std": 0.018049750477075577, "rewards//mean": 0.75079345703125, "rewards//std": 0.03023509867489338, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1644, "grad_norm": 4.424720287322998, "kl": 1.630806166678667, "learning_rate": 9.413247288991215e-07, "loss": 0.1631, "num_tokens": 7104736.0, "reward": 0.71746826171875, "reward_std": 0.009988827630877495, "rewards//mean": 0.71746826171875, "rewards//std": 0.03421039879322052, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1646, "grad_norm": 5.737839698791504, "kl": 1.7182586211711168, "learning_rate": 9.411754836032314e-07, "loss": 0.1718, "num_tokens": 7113368.0, "reward": 0.75677490234375, "reward_std": 0.016060881316661835, "rewards//mean": 0.75677490234375, "rewards//std": 0.03617316484451294, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1648, "grad_norm": 3.126258373260498, "kl": 1.0620542783290148, "learning_rate": 9.410260606019094e-07, "loss": 0.1062, "num_tokens": 7122128.0, "reward": 0.78106689453125, "reward_std": 0.00825115293264389, "rewards//mean": 0.78106689453125, "rewards//std": 0.02952379733324051, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.165, "grad_norm": 4.497267723083496, "kl": 1.0894294548779726, "learning_rate": 9.408764599553428e-07, "loss": 0.1089, "num_tokens": 7130776.0, "reward": 0.7479248046875, "reward_std": 0.009177840314805508, "rewards//mean": 0.7479248046875, "rewards//std": 0.02511337399482727, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1652, "grad_norm": 5.435197830200195, "kl": 1.483167264610529, "learning_rate": 9.40726681723791e-07, "loss": 0.1483, "num_tokens": 7139528.0, "reward": 0.73291015625, "reward_std": 0.0182212982326746, "rewards//mean": 0.73291015625, "rewards//std": 0.03925732895731926, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1654, "grad_norm": 14.3667573928833, "kl": 1.5110141914337873, "learning_rate": 9.405767259675844e-07, "loss": 0.1511, "num_tokens": 7148128.0, "reward": 0.721435546875, "reward_std": 0.014402812346816063, "rewards//mean": 0.721435546875, "rewards//std": 0.05114896222949028, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1656, "grad_norm": 9.0249605178833, "kl": 1.7240530531853437, "learning_rate": 9.404265927471253e-07, "loss": 0.1724, "num_tokens": 7156728.0, "reward": 0.7257080078125, "reward_std": 0.011079209856688976, "rewards//mean": 0.7257080078125, "rewards//std": 0.037683166563510895, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1658, "grad_norm": 2.765624523162842, "kl": 1.5330549646168947, "learning_rate": 9.402762821228874e-07, "loss": 0.1533, "num_tokens": 7165408.0, "reward": 0.7630615234375, "reward_std": 0.017075177282094955, "rewards//mean": 0.7630615234375, "rewards//std": 0.033683598041534424, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.166, "grad_norm": 4.675115585327148, "kl": 1.9389816485345364, "learning_rate": 9.401257941554156e-07, "loss": 0.1939, "num_tokens": 7174016.0, "reward": 0.757568359375, "reward_std": 0.01400386355817318, "rewards//mean": 0.757568359375, "rewards//std": 0.043995749205350876, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1662, "grad_norm": 5.841732978820801, "kl": 2.3723955769091845, "learning_rate": 9.399751289053266e-07, "loss": 0.2372, "num_tokens": 7182656.0, "reward": 0.74578857421875, "reward_std": 0.020208366215229034, "rewards//mean": 0.74578857421875, "rewards//std": 0.03778481483459473, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1664, "grad_norm": 4.87501859664917, "kl": 1.6249112337827682, "learning_rate": 9.398242864333083e-07, "loss": 0.1625, "num_tokens": 7191360.0, "reward": 0.7568359375, "reward_std": 0.008860049769282341, "rewards//mean": 0.7568359375, "rewards//std": 0.037311967462301254, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1666, "grad_norm": 5.310845851898193, "kl": 2.1611604560166597, "learning_rate": 9.396732668001199e-07, "loss": 0.2161, "num_tokens": 7200080.0, "reward": 0.7513427734375, "reward_std": 0.016101088374853134, "rewards//mean": 0.7513427734375, "rewards//std": 0.026358000934123993, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1668, "grad_norm": 2.9059054851531982, "kl": 1.839319683611393, "learning_rate": 9.395220700665922e-07, "loss": 0.1839, "num_tokens": 7208816.0, "reward": 0.76507568359375, "reward_std": 0.019753575325012207, "rewards//mean": 0.76507568359375, "rewards//std": 0.03695235028862953, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.167, "grad_norm": 5.503809928894043, "kl": 2.153663218021393, "learning_rate": 9.393706962936274e-07, "loss": 0.2154, "num_tokens": 7217488.0, "reward": 0.75311279296875, "reward_std": 0.017211418598890305, "rewards//mean": 0.75311279296875, "rewards//std": 0.03834396228194237, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1672, "grad_norm": 4.380270957946777, "kl": 2.353607654571533, "learning_rate": 9.392191455421987e-07, "loss": 0.2354, "num_tokens": 7226024.0, "reward": 0.7587890625, "reward_std": 0.017819222062826157, "rewards//mean": 0.7587890625, "rewards//std": 0.03496628254652023, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1674, "grad_norm": 5.176096439361572, "kl": 2.640985831618309, "learning_rate": 9.390674178733507e-07, "loss": 0.2641, "num_tokens": 7234672.0, "reward": 0.7403564453125, "reward_std": 0.022800642997026443, "rewards//mean": 0.7403564453125, "rewards//std": 0.04057685285806656, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1676, "grad_norm": 7.680821895599365, "kl": 2.7808349207043648, "learning_rate": 9.389155133481992e-07, "loss": 0.2781, "num_tokens": 7243248.0, "reward": 0.717041015625, "reward_std": 0.018634147942066193, "rewards//mean": 0.717041015625, "rewards//std": 0.03916081041097641, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1678, "grad_norm": 10.537120819091797, "kl": 2.879847614094615, "learning_rate": 9.387634320279314e-07, "loss": 0.288, "num_tokens": 7252024.0, "reward": 0.73858642578125, "reward_std": 0.01671629585325718, "rewards//mean": 0.73858642578125, "rewards//std": 0.033834878355264664, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.168, "grad_norm": 6.859736919403076, "kl": 2.1426927223801613, "learning_rate": 9.386111739738056e-07, "loss": 0.2143, "num_tokens": 7260632.0, "reward": 0.75164794921875, "reward_std": 0.011810492724180222, "rewards//mean": 0.75164794921875, "rewards//std": 0.034406304359436035, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1682, "grad_norm": 8.232377052307129, "kl": 2.640538850799203, "learning_rate": 9.384587392471514e-07, "loss": 0.2641, "num_tokens": 7269232.0, "reward": 0.75140380859375, "reward_std": 0.023731769993901253, "rewards//mean": 0.75140380859375, "rewards//std": 0.03931199759244919, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1684, "grad_norm": 6.554054260253906, "kl": 1.5579719077795744, "learning_rate": 9.383061279093696e-07, "loss": 0.1558, "num_tokens": 7277888.0, "reward": 0.74188232421875, "reward_std": 0.014754341915249825, "rewards//mean": 0.74188232421875, "rewards//std": 0.023850372061133385, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1686, "grad_norm": 6.298922061920166, "kl": 1.7747175488620996, "learning_rate": 9.381533400219317e-07, "loss": 0.1775, "num_tokens": 7286528.0, "reward": 0.76763916015625, "reward_std": 0.01292702741920948, "rewards//mean": 0.76763916015625, "rewards//std": 0.03302844986319542, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1688, "grad_norm": 4.180309772491455, "kl": 1.7337059956043959, "learning_rate": 9.38000375646381e-07, "loss": 0.1734, "num_tokens": 7295168.0, "reward": 0.753662109375, "reward_std": 0.01970200054347515, "rewards//mean": 0.753662109375, "rewards//std": 0.037724319845438004, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.169, "grad_norm": 3.2928497791290283, "kl": 2.2027333453297615, "learning_rate": 9.378472348443314e-07, "loss": 0.2203, "num_tokens": 7303800.0, "reward": 0.7252197265625, "reward_std": 0.0142702367156744, "rewards//mean": 0.7252197265625, "rewards//std": 0.024785738438367844, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1692, "grad_norm": 4.348998069763184, "kl": 1.6784103065729141, "learning_rate": 9.376939176774677e-07, "loss": 0.1678, "num_tokens": 7312504.0, "reward": 0.77056884765625, "reward_std": 0.01693711057305336, "rewards//mean": 0.77056884765625, "rewards//std": 0.039255738258361816, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1694, "grad_norm": 4.055181503295898, "kl": 2.1964838411659002, "learning_rate": 9.375404242075466e-07, "loss": 0.2196, "num_tokens": 7321176.0, "reward": 0.748291015625, "reward_std": 0.018741052597761154, "rewards//mean": 0.748291015625, "rewards//std": 0.036384351551532745, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1696, "grad_norm": 4.051187515258789, "kl": 1.29633485712111, "learning_rate": 9.373867544963948e-07, "loss": 0.1296, "num_tokens": 7329792.0, "reward": 0.7618408203125, "reward_std": 0.013074061833322048, "rewards//mean": 0.7618408203125, "rewards//std": 0.028279660269618034, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1698, "grad_norm": 7.050297737121582, "kl": 0.9690040778368711, "learning_rate": 9.372329086059107e-07, "loss": 0.0969, "num_tokens": 7338440.0, "reward": 0.77362060546875, "reward_std": 0.009072763845324516, "rewards//mean": 0.77362060546875, "rewards//std": 0.021947842091321945, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.17, "grad_norm": 3.919147491455078, "kl": 1.2022516168653965, "learning_rate": 9.370788865980632e-07, "loss": 0.1202, "num_tokens": 7347016.0, "reward": 0.75799560546875, "reward_std": 0.013911853544414043, "rewards//mean": 0.75799560546875, "rewards//std": 0.027307672426104546, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1702, "grad_norm": 4.519155979156494, "kl": 1.4689824804663658, "learning_rate": 9.369246885348925e-07, "loss": 0.1469, "num_tokens": 7355664.0, "reward": 0.700439453125, "reward_std": 0.00852000992745161, "rewards//mean": 0.700439453125, "rewards//std": 0.035481132566928864, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1704, "grad_norm": 6.284739971160889, "kl": 2.6096420623362064, "learning_rate": 9.367703144785095e-07, "loss": 0.261, "num_tokens": 7364392.0, "reward": 0.75140380859375, "reward_std": 0.018157094717025757, "rewards//mean": 0.75140380859375, "rewards//std": 0.04684588685631752, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1706, "grad_norm": 2.89241886138916, "kl": 1.513953823596239, "learning_rate": 9.366157644910959e-07, "loss": 0.1514, "num_tokens": 7373032.0, "reward": 0.741943359375, "reward_std": 0.012907277792692184, "rewards//mean": 0.741943359375, "rewards//std": 0.035535700619220734, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1708, "grad_norm": 6.478793621063232, "kl": 1.2654035575687885, "learning_rate": 9.364610386349047e-07, "loss": 0.1265, "num_tokens": 7381688.0, "reward": 0.75518798828125, "reward_std": 0.01790492609143257, "rewards//mean": 0.75518798828125, "rewards//std": 0.028203211724758148, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.171, "grad_norm": 4.850090980529785, "kl": 1.4152612686157227, "learning_rate": 9.363061369722594e-07, "loss": 0.1415, "num_tokens": 7390344.0, "reward": 0.76995849609375, "reward_std": 0.014351680874824524, "rewards//mean": 0.76995849609375, "rewards//std": 0.027936264872550964, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1712, "grad_norm": 5.298336982727051, "kl": 1.8484514653682709, "learning_rate": 9.361510595655544e-07, "loss": 0.1848, "num_tokens": 7398960.0, "reward": 0.7615966796875, "reward_std": 0.017985302954912186, "rewards//mean": 0.7615966796875, "rewards//std": 0.032068345695734024, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1714, "grad_norm": 3.7696399688720703, "kl": 1.8542726691812277, "learning_rate": 9.359958064772546e-07, "loss": 0.1854, "num_tokens": 7407632.0, "reward": 0.7333984375, "reward_std": 0.015964325517416, "rewards//mean": 0.7333984375, "rewards//std": 0.03637852892279625, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1716, "grad_norm": 4.572277069091797, "kl": 2.77557560056448, "learning_rate": 9.35840377769896e-07, "loss": 0.2776, "num_tokens": 7416224.0, "reward": 0.7694091796875, "reward_std": 0.02960910275578499, "rewards//mean": 0.7694091796875, "rewards//std": 0.03893188014626503, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1718, "grad_norm": 4.572153091430664, "kl": 2.2407423984259367, "learning_rate": 9.356847735060856e-07, "loss": 0.2241, "num_tokens": 7424872.0, "reward": 0.7784423828125, "reward_std": 0.023178264498710632, "rewards//mean": 0.7784423828125, "rewards//std": 0.034340519458055496, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.172, "grad_norm": 4.627325534820557, "kl": 1.7834851872175932, "learning_rate": 9.355289937485004e-07, "loss": 0.1783, "num_tokens": 7433544.0, "reward": 0.765625, "reward_std": 0.01757705584168434, "rewards//mean": 0.765625, "rewards//std": 0.03353790193796158, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1722, "grad_norm": 7.0663580894470215, "kl": 2.7586093079298735, "learning_rate": 9.353730385598886e-07, "loss": 0.2759, "num_tokens": 7442144.0, "reward": 0.756103515625, "reward_std": 0.023987147957086563, "rewards//mean": 0.756103515625, "rewards//std": 0.039228782057762146, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1724, "grad_norm": 7.226685523986816, "kl": 2.695838078856468, "learning_rate": 9.35216908003069e-07, "loss": 0.2696, "num_tokens": 7450776.0, "reward": 0.74609375, "reward_std": 0.020054344087839127, "rewards//mean": 0.74609375, "rewards//std": 0.041805945336818695, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1726, "grad_norm": 14.021259307861328, "kl": 2.264453437179327, "learning_rate": 9.350606021409308e-07, "loss": 0.2264, "num_tokens": 7459440.0, "reward": 0.77838134765625, "reward_std": 0.022445324808359146, "rewards//mean": 0.77838134765625, "rewards//std": 0.029183952137827873, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1728, "grad_norm": 5.810389518737793, "kl": 2.81511683575809, "learning_rate": 9.349041210364341e-07, "loss": 0.2815, "num_tokens": 7468160.0, "reward": 0.76959228515625, "reward_std": 0.01997082307934761, "rewards//mean": 0.76959228515625, "rewards//std": 0.032885145395994186, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.173, "grad_norm": 4.912047863006592, "kl": 1.9661368392407894, "learning_rate": 9.347474647526095e-07, "loss": 0.1966, "num_tokens": 7476752.0, "reward": 0.7652587890625, "reward_std": 0.011315623298287392, "rewards//mean": 0.7652587890625, "rewards//std": 0.019768141210079193, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1732, "grad_norm": 6.937151908874512, "kl": 2.165360538288951, "learning_rate": 9.34590633352558e-07, "loss": 0.2165, "num_tokens": 7485448.0, "reward": 0.76153564453125, "reward_std": 0.02091323770582676, "rewards//mean": 0.76153564453125, "rewards//std": 0.03633475676178932, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1734, "grad_norm": 46.60414505004883, "kl": 4.745315488427877, "learning_rate": 9.344336268994515e-07, "loss": 0.4745, "num_tokens": 7494104.0, "reward": 0.71124267578125, "reward_std": 0.024051643908023834, "rewards//mean": 0.71124267578125, "rewards//std": 0.038256216794252396, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1736, "grad_norm": 4.389271259307861, "kl": 1.9652090054005384, "learning_rate": 9.342764454565319e-07, "loss": 0.1965, "num_tokens": 7502752.0, "reward": 0.75091552734375, "reward_std": 0.016398733481764793, "rewards//mean": 0.75091552734375, "rewards//std": 0.03186964988708496, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1738, "grad_norm": 3.826873779296875, "kl": 1.7477713953703642, "learning_rate": 9.341190890871121e-07, "loss": 0.1748, "num_tokens": 7511400.0, "reward": 0.71832275390625, "reward_std": 0.01658746972680092, "rewards//mean": 0.71832275390625, "rewards//std": 0.03841298818588257, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.174, "grad_norm": 15.073668479919434, "kl": 2.9960688669234514, "learning_rate": 9.339615578545752e-07, "loss": 0.2996, "num_tokens": 7520040.0, "reward": 0.7525634765625, "reward_std": 0.020237531512975693, "rewards//mean": 0.7525634765625, "rewards//std": 0.03274288401007652, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1742, "grad_norm": 7.638735771179199, "kl": 2.2340067364275455, "learning_rate": 9.338038518223745e-07, "loss": 0.2234, "num_tokens": 7528672.0, "reward": 0.77197265625, "reward_std": 0.025480207055807114, "rewards//mean": 0.77197265625, "rewards//std": 0.03820677474141121, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1744, "grad_norm": 7.6421051025390625, "kl": 1.5636887550354004, "learning_rate": 9.336459710540343e-07, "loss": 0.1564, "num_tokens": 7537480.0, "reward": 0.75494384765625, "reward_std": 0.01364810299128294, "rewards//mean": 0.75494384765625, "rewards//std": 0.031337011605501175, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1746, "grad_norm": 5.859604835510254, "kl": 1.966890512034297, "learning_rate": 9.334879156131488e-07, "loss": 0.1967, "num_tokens": 7546136.0, "reward": 0.77093505859375, "reward_std": 0.02118687331676483, "rewards//mean": 0.77093505859375, "rewards//std": 0.030870771035552025, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1748, "grad_norm": 19.557403564453125, "kl": 2.767997670918703, "learning_rate": 9.333296855633827e-07, "loss": 0.2768, "num_tokens": 7554760.0, "reward": 0.75396728515625, "reward_std": 0.018575340509414673, "rewards//mean": 0.75396728515625, "rewards//std": 0.03726319968700409, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.175, "grad_norm": 7.539550304412842, "kl": 2.011590525507927, "learning_rate": 9.331712809684711e-07, "loss": 0.2012, "num_tokens": 7563360.0, "reward": 0.77105712890625, "reward_std": 0.018703941255807877, "rewards//mean": 0.77105712890625, "rewards//std": 0.03832145035266876, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1752, "grad_norm": 6.650249481201172, "kl": 1.3714905809611082, "learning_rate": 9.330127018922193e-07, "loss": 0.1371, "num_tokens": 7572032.0, "reward": 0.75164794921875, "reward_std": 0.0170736201107502, "rewards//mean": 0.75164794921875, "rewards//std": 0.03142625093460083, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1754, "grad_norm": 6.059285640716553, "kl": 1.7188973929733038, "learning_rate": 9.32853948398503e-07, "loss": 0.1719, "num_tokens": 7580640.0, "reward": 0.73638916015625, "reward_std": 0.018722107633948326, "rewards//mean": 0.73638916015625, "rewards//std": 0.03858206793665886, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1756, "grad_norm": 7.04297399520874, "kl": 1.3471319321542978, "learning_rate": 9.32695020551268e-07, "loss": 0.1347, "num_tokens": 7589144.0, "reward": 0.7181396484375, "reward_std": 0.006738506257534027, "rewards//mean": 0.7181396484375, "rewards//std": 0.0360657200217247, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1758, "grad_norm": 14.926488876342773, "kl": 1.8079385627061129, "learning_rate": 9.325359184145305e-07, "loss": 0.1808, "num_tokens": 7597784.0, "reward": 0.73773193359375, "reward_std": 0.009101016446948051, "rewards//mean": 0.73773193359375, "rewards//std": 0.02999129332602024, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.176, "grad_norm": 6.365018844604492, "kl": 1.6669358722865582, "learning_rate": 9.323766420523767e-07, "loss": 0.1667, "num_tokens": 7606448.0, "reward": 0.75244140625, "reward_std": 0.013080522418022156, "rewards//mean": 0.75244140625, "rewards//std": 0.02585030160844326, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1762, "grad_norm": 5.74120569229126, "kl": 1.0877815838903189, "learning_rate": 9.322171915289633e-07, "loss": 0.1088, "num_tokens": 7615096.0, "reward": 0.74676513671875, "reward_std": 0.011526472866535187, "rewards//mean": 0.74676513671875, "rewards//std": 0.030588505789637566, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1764, "grad_norm": 4.031838893890381, "kl": 2.4308047238737345, "learning_rate": 9.320575669085169e-07, "loss": 0.2431, "num_tokens": 7623656.0, "reward": 0.7398681640625, "reward_std": 0.021138392388820648, "rewards//mean": 0.7398681640625, "rewards//std": 0.03511640056967735, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1766, "grad_norm": 5.192139148712158, "kl": 1.3053984548896551, "learning_rate": 9.31897768255334e-07, "loss": 0.1305, "num_tokens": 7632288.0, "reward": 0.75262451171875, "reward_std": 0.014959865249693394, "rewards//mean": 0.75262451171875, "rewards//std": 0.03647862374782562, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1768, "grad_norm": 6.582054615020752, "kl": 1.6043333057314157, "learning_rate": 9.317377956337818e-07, "loss": 0.1604, "num_tokens": 7640848.0, "reward": 0.72216796875, "reward_std": 0.011610300280153751, "rewards//mean": 0.72216796875, "rewards//std": 0.03053259663283825, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.177, "grad_norm": 3.4720489978790283, "kl": 1.6192547511309385, "learning_rate": 9.315776491082972e-07, "loss": 0.1619, "num_tokens": 7649536.0, "reward": 0.7227783203125, "reward_std": 0.017359483987092972, "rewards//mean": 0.7227783203125, "rewards//std": 0.03739605471491814, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1772, "grad_norm": 4.207780361175537, "kl": 2.168350428342819, "learning_rate": 9.314173287433872e-07, "loss": 0.2168, "num_tokens": 7658176.0, "reward": 0.7642822265625, "reward_std": 0.01290786825120449, "rewards//mean": 0.7642822265625, "rewards//std": 0.02707419916987419, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1774, "grad_norm": 6.088476657867432, "kl": 1.813038071617484, "learning_rate": 9.312568346036287e-07, "loss": 0.1813, "num_tokens": 7666712.0, "reward": 0.76885986328125, "reward_std": 0.0204804427921772, "rewards//mean": 0.76885986328125, "rewards//std": 0.03475475311279297, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1776, "grad_norm": 5.763800144195557, "kl": 1.953675914555788, "learning_rate": 9.310961667536688e-07, "loss": 0.1954, "num_tokens": 7675416.0, "reward": 0.75189208984375, "reward_std": 0.009357169270515442, "rewards//mean": 0.75189208984375, "rewards//std": 0.038190871477127075, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1778, "grad_norm": 8.297173500061035, "kl": 1.1260207556188107, "learning_rate": 9.309353252582245e-07, "loss": 0.1126, "num_tokens": 7684048.0, "reward": 0.7529296875, "reward_std": 0.014209197834134102, "rewards//mean": 0.7529296875, "rewards//std": 0.03625848889350891, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.178, "grad_norm": 4.038049697875977, "kl": 1.6453554145991802, "learning_rate": 9.307743101820827e-07, "loss": 0.1645, "num_tokens": 7692680.0, "reward": 0.76947021484375, "reward_std": 0.017768949270248413, "rewards//mean": 0.76947021484375, "rewards//std": 0.030076975002884865, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1782, "grad_norm": 3.143575668334961, "kl": 1.7374642584472895, "learning_rate": 9.306131215901003e-07, "loss": 0.1737, "num_tokens": 7701288.0, "reward": 0.75341796875, "reward_std": 0.01818675920367241, "rewards//mean": 0.75341796875, "rewards//std": 0.04244700446724892, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1784, "grad_norm": 2.6063413619995117, "kl": 1.068063260987401, "learning_rate": 9.304517595472039e-07, "loss": 0.1068, "num_tokens": 7709928.0, "reward": 0.77374267578125, "reward_std": 0.007728885859251022, "rewards//mean": 0.77374267578125, "rewards//std": 0.02507800981402397, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1786, "grad_norm": 8.3313570022583, "kl": 2.452501432970166, "learning_rate": 9.302902241183903e-07, "loss": 0.2453, "num_tokens": 7718504.0, "reward": 0.7515869140625, "reward_std": 0.017169872298836708, "rewards//mean": 0.7515869140625, "rewards//std": 0.03167125955224037, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1788, "grad_norm": 11.411910057067871, "kl": 0.8639341052621603, "learning_rate": 9.301285153687259e-07, "loss": 0.0864, "num_tokens": 7727272.0, "reward": 0.767333984375, "reward_std": 0.005527937319129705, "rewards//mean": 0.767333984375, "rewards//std": 0.033479172736406326, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.179, "grad_norm": 21.02414321899414, "kl": 2.8881300818175077, "learning_rate": 9.29966633363347e-07, "loss": 0.2888, "num_tokens": 7736008.0, "reward": 0.736328125, "reward_std": 0.01383928395807743, "rewards//mean": 0.736328125, "rewards//std": 0.04077353700995445, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1792, "grad_norm": 13.486455917358398, "kl": 2.517023626714945, "learning_rate": 9.298045781674595e-07, "loss": 0.2517, "num_tokens": 7744680.0, "reward": 0.74737548828125, "reward_std": 0.012669989839196205, "rewards//mean": 0.74737548828125, "rewards//std": 0.03802602365612984, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1794, "grad_norm": 9.263734817504883, "kl": 1.875015266239643, "learning_rate": 9.296423498463395e-07, "loss": 0.1875, "num_tokens": 7753328.0, "reward": 0.787841796875, "reward_std": 0.016101669520139694, "rewards//mean": 0.787841796875, "rewards//std": 0.0325920507311821, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1796, "grad_norm": 8.410420417785645, "kl": 2.492162285372615, "learning_rate": 9.294799484653322e-07, "loss": 0.2492, "num_tokens": 7761912.0, "reward": 0.75738525390625, "reward_std": 0.013894645497202873, "rewards//mean": 0.75738525390625, "rewards//std": 0.026476258412003517, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1798, "grad_norm": 8.074809074401855, "kl": 2.27861930988729, "learning_rate": 9.29317374089853e-07, "loss": 0.2279, "num_tokens": 7770584.0, "reward": 0.75091552734375, "reward_std": 0.018135320395231247, "rewards//mean": 0.75091552734375, "rewards//std": 0.04076012596487999, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.18, "grad_norm": 9.852544784545898, "kl": 2.704176900908351, "learning_rate": 9.291546267853869e-07, "loss": 0.2704, "num_tokens": 7779176.0, "reward": 0.75152587890625, "reward_std": 0.01635921373963356, "rewards//mean": 0.75152587890625, "rewards//std": 0.03717942163348198, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1802, "grad_norm": 3.585476875305176, "kl": 1.8306819722056389, "learning_rate": 9.289917066174885e-07, "loss": 0.1831, "num_tokens": 7787928.0, "reward": 0.746337890625, "reward_std": 0.013768121600151062, "rewards//mean": 0.746337890625, "rewards//std": 0.031588222831487656, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1804, "grad_norm": 8.426328659057617, "kl": 2.0367767196148634, "learning_rate": 9.288286136517819e-07, "loss": 0.2037, "num_tokens": 7796560.0, "reward": 0.7659912109375, "reward_std": 0.01338636688888073, "rewards//mean": 0.7659912109375, "rewards//std": 0.03998004272580147, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1806, "grad_norm": 18.59994888305664, "kl": 2.681055746972561, "learning_rate": 9.28665347953961e-07, "loss": 0.2681, "num_tokens": 7805264.0, "reward": 0.71942138671875, "reward_std": 0.01512935757637024, "rewards//mean": 0.71942138671875, "rewards//std": 0.03471291437745094, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1808, "grad_norm": 9.62825870513916, "kl": 3.0684708058834076, "learning_rate": 9.285019095897893e-07, "loss": 0.3068, "num_tokens": 7813920.0, "reward": 0.76092529296875, "reward_std": 0.02263367548584938, "rewards//mean": 0.76092529296875, "rewards//std": 0.04002521559596062, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.181, "grad_norm": 15.713327407836914, "kl": 3.0565064027905464, "learning_rate": 9.283382986250996e-07, "loss": 0.3057, "num_tokens": 7822416.0, "reward": 0.70458984375, "reward_std": 0.019932560622692108, "rewards//mean": 0.70458984375, "rewards//std": 0.04498450830578804, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1812, "grad_norm": 4.569295883178711, "kl": 1.8074570018798113, "learning_rate": 9.281745151257945e-07, "loss": 0.1807, "num_tokens": 7831000.0, "reward": 0.72930908203125, "reward_std": 0.012088514864444733, "rewards//mean": 0.72930908203125, "rewards//std": 0.043668653815984726, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1814, "grad_norm": 13.229097366333008, "kl": 2.921083979308605, "learning_rate": 9.280105591578458e-07, "loss": 0.2921, "num_tokens": 7839616.0, "reward": 0.7606201171875, "reward_std": 0.02092359960079193, "rewards//mean": 0.7606201171875, "rewards//std": 0.038079578429460526, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1816, "grad_norm": 12.425844192504883, "kl": 2.733391372486949, "learning_rate": 9.278464307872951e-07, "loss": 0.2733, "num_tokens": 7848272.0, "reward": 0.78143310546875, "reward_std": 0.022195223718881607, "rewards//mean": 0.78143310546875, "rewards//std": 0.04661816731095314, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1818, "grad_norm": 5.109320163726807, "kl": 1.8693589717149734, "learning_rate": 9.276821300802533e-07, "loss": 0.1869, "num_tokens": 7856824.0, "reward": 0.755126953125, "reward_std": 0.018127374351024628, "rewards//mean": 0.755126953125, "rewards//std": 0.028132878243923187, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.182, "grad_norm": 8.810687065124512, "kl": 2.1022141221910715, "learning_rate": 9.275176571029006e-07, "loss": 0.2102, "num_tokens": 7865384.0, "reward": 0.7452392578125, "reward_std": 0.014001820236444473, "rewards//mean": 0.7452392578125, "rewards//std": 0.03700868785381317, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1822, "grad_norm": 4.805698871612549, "kl": 2.360915334895253, "learning_rate": 9.273530119214867e-07, "loss": 0.2361, "num_tokens": 7874016.0, "reward": 0.76861572265625, "reward_std": 0.014025718905031681, "rewards//mean": 0.76861572265625, "rewards//std": 0.03152580186724663, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1824, "grad_norm": 9.826226234436035, "kl": 1.457917682826519, "learning_rate": 9.271881946023308e-07, "loss": 0.1458, "num_tokens": 7882688.0, "reward": 0.78216552734375, "reward_std": 0.019405674189329147, "rewards//mean": 0.78216552734375, "rewards//std": 0.03130995109677315, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1826, "grad_norm": 4.0873188972473145, "kl": 2.3320354279130697, "learning_rate": 9.270232052118212e-07, "loss": 0.2332, "num_tokens": 7891408.0, "reward": 0.75823974609375, "reward_std": 0.02385760098695755, "rewards//mean": 0.75823974609375, "rewards//std": 0.04004298523068428, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1828, "grad_norm": 5.695456027984619, "kl": 1.780807789415121, "learning_rate": 9.268580438164155e-07, "loss": 0.1781, "num_tokens": 7900024.0, "reward": 0.75, "reward_std": 0.01188972033560276, "rewards//mean": 0.75, "rewards//std": 0.03650480881333351, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.183, "grad_norm": 9.227212905883789, "kl": 1.6088938806205988, "learning_rate": 9.266927104826408e-07, "loss": 0.1609, "num_tokens": 7908696.0, "reward": 0.75811767578125, "reward_std": 0.01559586450457573, "rewards//mean": 0.75811767578125, "rewards//std": 0.03580343350768089, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1832, "grad_norm": 7.26616096496582, "kl": 1.1849891766905785, "learning_rate": 9.265272052770935e-07, "loss": 0.1185, "num_tokens": 7917392.0, "reward": 0.77154541015625, "reward_std": 0.012334248051047325, "rewards//mean": 0.77154541015625, "rewards//std": 0.02179904840886593, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1834, "grad_norm": 2.9212043285369873, "kl": 0.9974309261888266, "learning_rate": 9.263615282664388e-07, "loss": 0.0997, "num_tokens": 7926048.0, "reward": 0.7374267578125, "reward_std": 0.008398480713367462, "rewards//mean": 0.7374267578125, "rewards//std": 0.04307553917169571, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1836, "grad_norm": 8.052578926086426, "kl": 2.2471475172787905, "learning_rate": 9.261956795174115e-07, "loss": 0.2247, "num_tokens": 7935016.0, "reward": 0.772705078125, "reward_std": 0.015445258468389511, "rewards//mean": 0.772705078125, "rewards//std": 0.04904589429497719, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1838, "grad_norm": 5.683385848999023, "kl": 1.0640036799013615, "learning_rate": 9.260296590968156e-07, "loss": 0.1064, "num_tokens": 7943656.0, "reward": 0.7342529296875, "reward_std": 0.006155871320515871, "rewards//mean": 0.7342529296875, "rewards//std": 0.029748860746622086, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.184, "grad_norm": 6.42412805557251, "kl": 1.1672633066773415, "learning_rate": 9.258634670715237e-07, "loss": 0.1167, "num_tokens": 7952288.0, "reward": 0.73382568359375, "reward_std": 0.007497087121009827, "rewards//mean": 0.73382568359375, "rewards//std": 0.026583530008792877, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1842, "grad_norm": 5.684611797332764, "kl": 1.2743122726678848, "learning_rate": 9.256971035084784e-07, "loss": 0.1274, "num_tokens": 7960904.0, "reward": 0.74481201171875, "reward_std": 0.01006082259118557, "rewards//mean": 0.74481201171875, "rewards//std": 0.03901320695877075, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1844, "grad_norm": 5.896803855895996, "kl": 1.1679475717246532, "learning_rate": 9.255305684746907e-07, "loss": 0.1168, "num_tokens": 7969416.0, "reward": 0.78533935546875, "reward_std": 0.01715112291276455, "rewards//mean": 0.78533935546875, "rewards//std": 0.03011671081185341, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1846, "grad_norm": 4.434075355529785, "kl": 1.6047652177512646, "learning_rate": 9.253638620372408e-07, "loss": 0.1605, "num_tokens": 7978048.0, "reward": 0.773193359375, "reward_std": 0.017079252749681473, "rewards//mean": 0.773193359375, "rewards//std": 0.03856877237558365, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1848, "grad_norm": 4.12543249130249, "kl": 0.8505661133676767, "learning_rate": 9.251969842632783e-07, "loss": 0.0851, "num_tokens": 7986648.0, "reward": 0.76025390625, "reward_std": 0.009053455665707588, "rewards//mean": 0.76025390625, "rewards//std": 0.02545374259352684, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.185, "grad_norm": 7.275959491729736, "kl": 1.6916682124137878, "learning_rate": 9.250299352200212e-07, "loss": 0.1692, "num_tokens": 7995320.0, "reward": 0.76251220703125, "reward_std": 0.01787765696644783, "rewards//mean": 0.76251220703125, "rewards//std": 0.029016973450779915, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1852, "grad_norm": 26.653705596923828, "kl": 0.981909729540348, "learning_rate": 9.248627149747572e-07, "loss": 0.0982, "num_tokens": 8003952.0, "reward": 0.7724609375, "reward_std": 0.017310626804828644, "rewards//mean": 0.7724609375, "rewards//std": 0.03970210626721382, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1854, "grad_norm": 4.353677749633789, "kl": 2.0300528090447187, "learning_rate": 9.246953235948422e-07, "loss": 0.203, "num_tokens": 8012560.0, "reward": 0.7486572265625, "reward_std": 0.017548371106386185, "rewards//mean": 0.7486572265625, "rewards//std": 0.036816760897636414, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1856, "grad_norm": 3.8817214965820312, "kl": 1.986955901607871, "learning_rate": 9.245277611477018e-07, "loss": 0.1987, "num_tokens": 8021216.0, "reward": 0.767333984375, "reward_std": 0.022250382229685783, "rewards//mean": 0.767333984375, "rewards//std": 0.03420204669237137, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1858, "grad_norm": 10.012185096740723, "kl": 1.9269897807389498, "learning_rate": 9.2436002770083e-07, "loss": 0.1927, "num_tokens": 8029968.0, "reward": 0.74853515625, "reward_std": 0.015564601868391037, "rewards//mean": 0.74853515625, "rewards//std": 0.04699111357331276, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.186, "grad_norm": 14.43603229522705, "kl": 1.2859059367328882, "learning_rate": 9.241921233217897e-07, "loss": 0.1286, "num_tokens": 8038632.0, "reward": 0.76507568359375, "reward_std": 0.009841764345765114, "rewards//mean": 0.76507568359375, "rewards//std": 0.03549472615122795, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1862, "grad_norm": 22.928585052490234, "kl": 2.3845520988106728, "learning_rate": 9.240240480782129e-07, "loss": 0.2385, "num_tokens": 8047248.0, "reward": 0.7322998046875, "reward_std": 0.01873641088604927, "rewards//mean": 0.7322998046875, "rewards//std": 0.03428051620721817, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1864, "grad_norm": 5.155069828033447, "kl": 2.250894131138921, "learning_rate": 9.238558020378003e-07, "loss": 0.2251, "num_tokens": 8055896.0, "reward": 0.72259521484375, "reward_std": 0.01779280975461006, "rewards//mean": 0.72259521484375, "rewards//std": 0.04486586153507233, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1866, "grad_norm": 4.8972320556640625, "kl": 2.1627304777503014, "learning_rate": 9.236873852683212e-07, "loss": 0.2163, "num_tokens": 8064552.0, "reward": 0.7603759765625, "reward_std": 0.022223878651857376, "rewards//mean": 0.7603759765625, "rewards//std": 0.03432464599609375, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1868, "grad_norm": 17.42308807373047, "kl": 2.201874129474163, "learning_rate": 9.235187978376141e-07, "loss": 0.2202, "num_tokens": 8073232.0, "reward": 0.75555419921875, "reward_std": 0.024196408689022064, "rewards//mean": 0.75555419921875, "rewards//std": 0.0436471588909626, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.187, "grad_norm": 4.307592868804932, "kl": 1.4050643611699343, "learning_rate": 9.233500398135858e-07, "loss": 0.1405, "num_tokens": 8081840.0, "reward": 0.74835205078125, "reward_std": 0.016355328261852264, "rewards//mean": 0.74835205078125, "rewards//std": 0.03643045201897621, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1872, "grad_norm": 5.395535469055176, "kl": 2.3265808075666428, "learning_rate": 9.23181111264212e-07, "loss": 0.2327, "num_tokens": 8090544.0, "reward": 0.7430419921875, "reward_std": 0.02195778861641884, "rewards//mean": 0.7430419921875, "rewards//std": 0.03717845305800438, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1874, "grad_norm": 17.81551742553711, "kl": 1.4119716454297304, "learning_rate": 9.230120122575375e-07, "loss": 0.1412, "num_tokens": 8099208.0, "reward": 0.74847412109375, "reward_std": 0.017981721088290215, "rewards//mean": 0.74847412109375, "rewards//std": 0.03422145918011665, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1876, "grad_norm": 9.84168529510498, "kl": 2.9426979944109917, "learning_rate": 9.228427428616748e-07, "loss": 0.2943, "num_tokens": 8107856.0, "reward": 0.74212646484375, "reward_std": 0.015461022034287453, "rewards//mean": 0.74212646484375, "rewards//std": 0.030429234728217125, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1878, "grad_norm": 4.055408000946045, "kl": 1.9178044013679028, "learning_rate": 9.22673303144806e-07, "loss": 0.1918, "num_tokens": 8116456.0, "reward": 0.71600341796875, "reward_std": 0.015704046934843063, "rewards//mean": 0.71600341796875, "rewards//std": 0.04995502531528473, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.188, "grad_norm": 4.922194004058838, "kl": 1.9541615787893534, "learning_rate": 9.22503693175181e-07, "loss": 0.1954, "num_tokens": 8125016.0, "reward": 0.742919921875, "reward_std": 0.015807168558239937, "rewards//mean": 0.742919921875, "rewards//std": 0.04359757527709007, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1882, "grad_norm": 7.407509803771973, "kl": 1.4618559051305056, "learning_rate": 9.223339130211192e-07, "loss": 0.1462, "num_tokens": 8133600.0, "reward": 0.7576904296875, "reward_std": 0.017947595566511154, "rewards//mean": 0.7576904296875, "rewards//std": 0.038944318890571594, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1884, "grad_norm": 16.472244262695312, "kl": 1.2443057876080275, "learning_rate": 9.221639627510075e-07, "loss": 0.1244, "num_tokens": 8142232.0, "reward": 0.76678466796875, "reward_std": 0.018832771107554436, "rewards//mean": 0.76678466796875, "rewards//std": 0.027889082208275795, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1886, "grad_norm": 12.38464641571045, "kl": 1.4479559306055307, "learning_rate": 9.219938424333023e-07, "loss": 0.1448, "num_tokens": 8150976.0, "reward": 0.7391357421875, "reward_std": 0.022428398951888084, "rewards//mean": 0.7391357421875, "rewards//std": 0.03541002422571182, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1888, "grad_norm": 4.915424346923828, "kl": 1.8865959215909243, "learning_rate": 9.218235521365276e-07, "loss": 0.1887, "num_tokens": 8159640.0, "reward": 0.76922607421875, "reward_std": 0.019969366490840912, "rewards//mean": 0.76922607421875, "rewards//std": 0.056940995156764984, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.189, "grad_norm": 8.629729270935059, "kl": 2.373987479135394, "learning_rate": 9.216530919292767e-07, "loss": 0.2374, "num_tokens": 8168288.0, "reward": 0.73284912109375, "reward_std": 0.01817513443529606, "rewards//mean": 0.73284912109375, "rewards//std": 0.04339568316936493, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1892, "grad_norm": 11.080283164978027, "kl": 3.1587901078164577, "learning_rate": 9.214824618802107e-07, "loss": 0.3159, "num_tokens": 8176928.0, "reward": 0.75714111328125, "reward_std": 0.026241883635520935, "rewards//mean": 0.75714111328125, "rewards//std": 0.034221019595861435, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1894, "grad_norm": 9.931846618652344, "kl": 1.0177927780896425, "learning_rate": 9.213116620580596e-07, "loss": 0.1018, "num_tokens": 8185472.0, "reward": 0.7650146484375, "reward_std": 0.011359244585037231, "rewards//mean": 0.7650146484375, "rewards//std": 0.022512100636959076, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1896, "grad_norm": 4.0106520652771, "kl": 1.7773894555866718, "learning_rate": 9.211406925316212e-07, "loss": 0.1777, "num_tokens": 8194096.0, "reward": 0.76605224609375, "reward_std": 0.02363177202641964, "rewards//mean": 0.76605224609375, "rewards//std": 0.043174657970666885, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1898, "grad_norm": 4.636829853057861, "kl": 1.7715414706617594, "learning_rate": 9.209695533697623e-07, "loss": 0.1772, "num_tokens": 8202632.0, "reward": 0.74615478515625, "reward_std": 0.01873505488038063, "rewards//mean": 0.74615478515625, "rewards//std": 0.028077874332666397, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.19, "grad_norm": 6.046106815338135, "kl": 1.8093213103711605, "learning_rate": 9.207982446414177e-07, "loss": 0.1809, "num_tokens": 8211200.0, "reward": 0.76531982421875, "reward_std": 0.01822560280561447, "rewards//mean": 0.76531982421875, "rewards//std": 0.02873282879590988, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1902, "grad_norm": 4.0911545753479, "kl": 2.0045978389680386, "learning_rate": 9.206267664155906e-07, "loss": 0.2005, "num_tokens": 8219872.0, "reward": 0.7706298828125, "reward_std": 0.024520523846149445, "rewards//mean": 0.7706298828125, "rewards//std": 0.04281610995531082, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1904, "grad_norm": 14.912626266479492, "kl": 3.176901113241911, "learning_rate": 9.20455118761352e-07, "loss": 0.3177, "num_tokens": 8228504.0, "reward": 0.7489013671875, "reward_std": 0.017545852810144424, "rewards//mean": 0.7489013671875, "rewards//std": 0.028189590200781822, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1906, "grad_norm": 4.028141498565674, "kl": 1.8206164687871933, "learning_rate": 9.202833017478421e-07, "loss": 0.1821, "num_tokens": 8237192.0, "reward": 0.73333740234375, "reward_std": 0.019356444478034973, "rewards//mean": 0.73333740234375, "rewards//std": 0.036869507282972336, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1908, "grad_norm": 5.144661903381348, "kl": 1.748854050412774, "learning_rate": 9.201113154442683e-07, "loss": 0.1749, "num_tokens": 8245880.0, "reward": 0.72723388671875, "reward_std": 0.011367292143404484, "rewards//mean": 0.72723388671875, "rewards//std": 0.039294663816690445, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.191, "grad_norm": 4.669267177581787, "kl": 1.543099394068122, "learning_rate": 9.199391599199071e-07, "loss": 0.1543, "num_tokens": 8254472.0, "reward": 0.7713623046875, "reward_std": 0.013431857340037823, "rewards//mean": 0.7713623046875, "rewards//std": 0.02971627563238144, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1912, "grad_norm": 9.140859603881836, "kl": 1.8747816868126392, "learning_rate": 9.197668352441023e-07, "loss": 0.1875, "num_tokens": 8263168.0, "reward": 0.75628662109375, "reward_std": 0.01882949098944664, "rewards//mean": 0.75628662109375, "rewards//std": 0.03054145723581314, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1914, "grad_norm": 3.491973400115967, "kl": 1.6943730656057596, "learning_rate": 9.195943414862665e-07, "loss": 0.1694, "num_tokens": 8271784.0, "reward": 0.71514892578125, "reward_std": 0.015275931917130947, "rewards//mean": 0.71514892578125, "rewards//std": 0.04649265855550766, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1916, "grad_norm": 7.8375420570373535, "kl": 2.460888223722577, "learning_rate": 9.194216787158804e-07, "loss": 0.2461, "num_tokens": 8280496.0, "reward": 0.73760986328125, "reward_std": 0.0214063823223114, "rewards//mean": 0.73760986328125, "rewards//std": 0.04603851959109306, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1918, "grad_norm": 6.797418117523193, "kl": 1.7108256202191114, "learning_rate": 9.192488470024919e-07, "loss": 0.1711, "num_tokens": 8289160.0, "reward": 0.7525634765625, "reward_std": 0.013072742149233818, "rewards//mean": 0.7525634765625, "rewards//std": 0.03263916075229645, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.192, "grad_norm": 5.766705513000488, "kl": 2.004494074732065, "learning_rate": 9.190758464157182e-07, "loss": 0.2004, "num_tokens": 8297768.0, "reward": 0.72174072265625, "reward_std": 0.01889863796532154, "rewards//mean": 0.72174072265625, "rewards//std": 0.03785166144371033, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1922, "grad_norm": 6.262502670288086, "kl": 1.8866459857672453, "learning_rate": 9.189026770252436e-07, "loss": 0.1887, "num_tokens": 8306440.0, "reward": 0.74481201171875, "reward_std": 0.023079926148056984, "rewards//mean": 0.74481201171875, "rewards//std": 0.04129328951239586, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1924, "grad_norm": 4.815186977386475, "kl": 1.9248689897358418, "learning_rate": 9.187293389008208e-07, "loss": 0.1925, "num_tokens": 8315072.0, "reward": 0.74517822265625, "reward_std": 0.015207601711153984, "rewards//mean": 0.74517822265625, "rewards//std": 0.03300460800528526, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1926, "grad_norm": 5.999852657318115, "kl": 1.9934442956000566, "learning_rate": 9.185558321122704e-07, "loss": 0.1993, "num_tokens": 8323776.0, "reward": 0.77001953125, "reward_std": 0.024884480983018875, "rewards//mean": 0.77001953125, "rewards//std": 0.04075274243950844, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1928, "grad_norm": 7.562851428985596, "kl": 1.7993676457554102, "learning_rate": 9.183821567294808e-07, "loss": 0.1799, "num_tokens": 8332408.0, "reward": 0.744873046875, "reward_std": 0.02207602560520172, "rewards//mean": 0.744873046875, "rewards//std": 0.033355962485075, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.193, "grad_norm": 7.516448974609375, "kl": 2.334638250991702, "learning_rate": 9.182083128224086e-07, "loss": 0.2335, "num_tokens": 8341096.0, "reward": 0.74591064453125, "reward_std": 0.017029182985424995, "rewards//mean": 0.74591064453125, "rewards//std": 0.032056719064712524, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1932, "grad_norm": 4.7977824211120605, "kl": 2.1065255533903837, "learning_rate": 9.180343004610779e-07, "loss": 0.2107, "num_tokens": 8349760.0, "reward": 0.76287841796875, "reward_std": 0.026503656059503555, "rewards//mean": 0.76287841796875, "rewards//std": 0.03985845670104027, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1934, "grad_norm": 5.4156670570373535, "kl": 1.567462394014001, "learning_rate": 9.178601197155811e-07, "loss": 0.1567, "num_tokens": 8358440.0, "reward": 0.745361328125, "reward_std": 0.013103803619742393, "rewards//mean": 0.745361328125, "rewards//std": 0.03417370840907097, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1936, "grad_norm": 9.369543075561523, "kl": 1.647336831316352, "learning_rate": 9.176857706560779e-07, "loss": 0.1647, "num_tokens": 8367064.0, "reward": 0.7431640625, "reward_std": 0.017528044059872627, "rewards//mean": 0.7431640625, "rewards//std": 0.03703179210424423, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1938, "grad_norm": 3.257838487625122, "kl": 1.6657201033085585, "learning_rate": 9.175112533527963e-07, "loss": 0.1666, "num_tokens": 8375712.0, "reward": 0.72479248046875, "reward_std": 0.011116349138319492, "rewards//mean": 0.72479248046875, "rewards//std": 0.028279326856136322, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.194, "grad_norm": 9.640281677246094, "kl": 2.476398589089513, "learning_rate": 9.173365678760317e-07, "loss": 0.2476, "num_tokens": 8384464.0, "reward": 0.75396728515625, "reward_std": 0.0185113325715065, "rewards//mean": 0.75396728515625, "rewards//std": 0.04257799685001373, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1942, "grad_norm": 8.67280101776123, "kl": 2.546649331226945, "learning_rate": 9.171617142961476e-07, "loss": 0.2547, "num_tokens": 8393096.0, "reward": 0.7593994140625, "reward_std": 0.015201020054519176, "rewards//mean": 0.7593994140625, "rewards//std": 0.034672170877456665, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1944, "grad_norm": 4.121150970458984, "kl": 2.2070589400827885, "learning_rate": 9.169866926835747e-07, "loss": 0.2207, "num_tokens": 8401712.0, "reward": 0.739501953125, "reward_std": 0.020942389965057373, "rewards//mean": 0.739501953125, "rewards//std": 0.03589511662721634, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1946, "grad_norm": 12.351953506469727, "kl": 3.461782954633236, "learning_rate": 9.16811503108812e-07, "loss": 0.3462, "num_tokens": 8410496.0, "reward": 0.73712158203125, "reward_std": 0.019555550068616867, "rewards//mean": 0.73712158203125, "rewards//std": 0.037645939737558365, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1948, "grad_norm": 10.169455528259277, "kl": 1.4090617876499891, "learning_rate": 9.166361456424257e-07, "loss": 0.1409, "num_tokens": 8419064.0, "reward": 0.76898193359375, "reward_std": 0.013797442428767681, "rewards//mean": 0.76898193359375, "rewards//std": 0.03554246202111244, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.195, "grad_norm": 6.749905586242676, "kl": 2.9098270386457443, "learning_rate": 9.164606203550497e-07, "loss": 0.291, "num_tokens": 8427840.0, "reward": 0.764892578125, "reward_std": 0.013962388038635254, "rewards//mean": 0.764892578125, "rewards//std": 0.03703097254037857, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1952, "grad_norm": 9.379358291625977, "kl": 3.743213150650263, "learning_rate": 9.162849273173856e-07, "loss": 0.3743, "num_tokens": 8436616.0, "reward": 0.7681884765625, "reward_std": 0.027786388993263245, "rewards//mean": 0.7681884765625, "rewards//std": 0.04356614127755165, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1954, "grad_norm": 6.3227858543396, "kl": 2.949258007109165, "learning_rate": 9.161090666002027e-07, "loss": 0.2949, "num_tokens": 8445216.0, "reward": 0.76934814453125, "reward_std": 0.023682300001382828, "rewards//mean": 0.76934814453125, "rewards//std": 0.050551459193229675, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1956, "grad_norm": 3.9359500408172607, "kl": 2.529670547693968, "learning_rate": 9.159330382743373e-07, "loss": 0.253, "num_tokens": 8453896.0, "reward": 0.7742919921875, "reward_std": 0.023785840719938278, "rewards//mean": 0.7742919921875, "rewards//std": 0.04157046228647232, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1958, "grad_norm": 7.5341081619262695, "kl": 2.6453016586601734, "learning_rate": 9.157568424106941e-07, "loss": 0.2645, "num_tokens": 8462552.0, "reward": 0.77020263671875, "reward_std": 0.015745025128126144, "rewards//mean": 0.77020263671875, "rewards//std": 0.03167765215039253, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.196, "grad_norm": 3.5417768955230713, "kl": 2.399245113134384, "learning_rate": 9.155804790802443e-07, "loss": 0.2399, "num_tokens": 8471232.0, "reward": 0.75244140625, "reward_std": 0.024859551340341568, "rewards//mean": 0.75244140625, "rewards//std": 0.03393634408712387, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1962, "grad_norm": 3.2599830627441406, "kl": 2.5723649710416794, "learning_rate": 9.154039483540272e-07, "loss": 0.2572, "num_tokens": 8479856.0, "reward": 0.7513427734375, "reward_std": 0.01924026757478714, "rewards//mean": 0.7513427734375, "rewards//std": 0.03221340849995613, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1964, "grad_norm": 11.227360725402832, "kl": 2.852146787568927, "learning_rate": 9.152272503031495e-07, "loss": 0.2852, "num_tokens": 8488528.0, "reward": 0.7349853515625, "reward_std": 0.02431170456111431, "rewards//mean": 0.7349853515625, "rewards//std": 0.05037766695022583, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1966, "grad_norm": 5.53578519821167, "kl": 2.192291097715497, "learning_rate": 9.150503849987851e-07, "loss": 0.2192, "num_tokens": 8497112.0, "reward": 0.7540283203125, "reward_std": 0.01734818145632744, "rewards//mean": 0.7540283203125, "rewards//std": 0.04089198634028435, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1968, "grad_norm": 5.688729763031006, "kl": 2.156856844201684, "learning_rate": 9.14873352512175e-07, "loss": 0.2157, "num_tokens": 8505752.0, "reward": 0.76312255859375, "reward_std": 0.021846525371074677, "rewards//mean": 0.76312255859375, "rewards//std": 0.036679331213235855, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.197, "grad_norm": 3.1646580696105957, "kl": 1.9843303374946117, "learning_rate": 9.146961529146284e-07, "loss": 0.1984, "num_tokens": 8514376.0, "reward": 0.74755859375, "reward_std": 0.015789909288287163, "rewards//mean": 0.74755859375, "rewards//std": 0.04050833731889725, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1972, "grad_norm": 8.397634506225586, "kl": 1.9697171039879322, "learning_rate": 9.145187862775208e-07, "loss": 0.197, "num_tokens": 8522960.0, "reward": 0.7210693359375, "reward_std": 0.011889157816767693, "rewards//mean": 0.7210693359375, "rewards//std": 0.040503665804862976, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1974, "grad_norm": 2.9615468978881836, "kl": 2.1096298955380917, "learning_rate": 9.143412526722958e-07, "loss": 0.211, "num_tokens": 8531496.0, "reward": 0.71539306640625, "reward_std": 0.011229978874325752, "rewards//mean": 0.71539306640625, "rewards//std": 0.035827524960041046, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1976, "grad_norm": 5.069641590118408, "kl": 3.2557414285838604, "learning_rate": 9.141635521704636e-07, "loss": 0.3256, "num_tokens": 8540232.0, "reward": 0.74566650390625, "reward_std": 0.02486158348619938, "rewards//mean": 0.74566650390625, "rewards//std": 0.04670478776097298, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1978, "grad_norm": 3.38765549659729, "kl": 2.23440158367157, "learning_rate": 9.139856848436023e-07, "loss": 0.2234, "num_tokens": 8548920.0, "reward": 0.7530517578125, "reward_std": 0.01635737717151642, "rewards//mean": 0.7530517578125, "rewards//std": 0.03826991468667984, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.198, "grad_norm": 10.754985809326172, "kl": 2.968583047389984, "learning_rate": 9.138076507633565e-07, "loss": 0.2969, "num_tokens": 8557688.0, "reward": 0.76715087890625, "reward_std": 0.02133917436003685, "rewards//mean": 0.76715087890625, "rewards//std": 0.041763149201869965, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1982, "grad_norm": 4.791354656219482, "kl": 2.0338217727839947, "learning_rate": 9.136294500014385e-07, "loss": 0.2034, "num_tokens": 8566352.0, "reward": 0.74432373046875, "reward_std": 0.02426404505968094, "rewards//mean": 0.74432373046875, "rewards//std": 0.04016867280006409, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1984, "grad_norm": 6.58543062210083, "kl": 2.2268502674996853, "learning_rate": 9.134510826296276e-07, "loss": 0.2227, "num_tokens": 8575064.0, "reward": 0.73406982421875, "reward_std": 0.014979375526309013, "rewards//mean": 0.73406982421875, "rewards//std": 0.03583724424242973, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1986, "grad_norm": 7.094622611999512, "kl": 1.471730774268508, "learning_rate": 9.1327254871977e-07, "loss": 0.1472, "num_tokens": 8583664.0, "reward": 0.7391357421875, "reward_std": 0.016242019832134247, "rewards//mean": 0.7391357421875, "rewards//std": 0.03274473547935486, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1988, "grad_norm": 5.462141990661621, "kl": 1.950741233304143, "learning_rate": 9.130938483437791e-07, "loss": 0.1951, "num_tokens": 8592344.0, "reward": 0.7523193359375, "reward_std": 0.017355646938085556, "rewards//mean": 0.7523193359375, "rewards//std": 0.03225473314523697, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.199, "grad_norm": 3.3840034008026123, "kl": 2.0983388610184193, "learning_rate": 9.129149815736357e-07, "loss": 0.2098, "num_tokens": 8601024.0, "reward": 0.72772216796875, "reward_std": 0.018127836287021637, "rewards//mean": 0.72772216796875, "rewards//std": 0.04553665593266487, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1992, "grad_norm": 6.309532642364502, "kl": 1.3534683883190155, "learning_rate": 9.12735948481387e-07, "loss": 0.1353, "num_tokens": 8609688.0, "reward": 0.7733154296875, "reward_std": 0.016825225204229355, "rewards//mean": 0.7733154296875, "rewards//std": 0.03628499060869217, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1994, "grad_norm": 6.512838363647461, "kl": 1.433094348758459, "learning_rate": 9.125567491391475e-07, "loss": 0.1433, "num_tokens": 8618400.0, "reward": 0.7647705078125, "reward_std": 0.016744688153266907, "rewards//mean": 0.7647705078125, "rewards//std": 0.03630167245864868, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1996, "grad_norm": 6.351593494415283, "kl": 1.7570994179695845, "learning_rate": 9.123773836190989e-07, "loss": 0.1757, "num_tokens": 8627216.0, "reward": 0.76605224609375, "reward_std": 0.014316737651824951, "rewards//mean": 0.76605224609375, "rewards//std": 0.03454899787902832, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.1998, "grad_norm": 3.224513053894043, "kl": 1.7378581073135138, "learning_rate": 9.121978519934895e-07, "loss": 0.1738, "num_tokens": 8635816.0, "reward": 0.71142578125, "reward_std": 0.009469851851463318, "rewards//mean": 0.71142578125, "rewards//std": 0.03517002612352371, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2, "grad_norm": 3.5807652473449707, "kl": 2.5934614650905132, "learning_rate": 9.120181543346346e-07, "loss": 0.2593, "num_tokens": 8644496.0, "reward": 0.74176025390625, "reward_std": 0.027724089100956917, "rewards//mean": 0.74176025390625, "rewards//std": 0.04741503298282623, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2002, "grad_norm": 5.258052825927734, "kl": 0.9394328705966473, "learning_rate": 9.118382907149163e-07, "loss": 0.0939, "num_tokens": 8653088.0, "reward": 0.75982666015625, "reward_std": 0.01254335604608059, "rewards//mean": 0.75982666015625, "rewards//std": 0.033013783395290375, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2004, "grad_norm": 3.4401493072509766, "kl": 1.7041137032210827, "learning_rate": 9.116582612067838e-07, "loss": 0.1704, "num_tokens": 8661776.0, "reward": 0.7347412109375, "reward_std": 0.01181069016456604, "rewards//mean": 0.7347412109375, "rewards//std": 0.03357917442917824, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2006, "grad_norm": 4.4879608154296875, "kl": 1.4810854904353619, "learning_rate": 9.11478065882753e-07, "loss": 0.1481, "num_tokens": 8670552.0, "reward": 0.7659912109375, "reward_std": 0.019971122965216637, "rewards//mean": 0.7659912109375, "rewards//std": 0.038880523294210434, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2008, "grad_norm": 3.7709860801696777, "kl": 2.517193468287587, "learning_rate": 9.112977048154064e-07, "loss": 0.2517, "num_tokens": 8679200.0, "reward": 0.7618408203125, "reward_std": 0.023658381775021553, "rewards//mean": 0.7618408203125, "rewards//std": 0.03793619945645332, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.201, "grad_norm": 3.0166332721710205, "kl": 1.6556002162396908, "learning_rate": 9.111171780773936e-07, "loss": 0.1656, "num_tokens": 8687768.0, "reward": 0.74688720703125, "reward_std": 0.01199406012892723, "rewards//mean": 0.74688720703125, "rewards//std": 0.03949524834752083, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2012, "grad_norm": 10.680081367492676, "kl": 2.370729196816683, "learning_rate": 9.109364857414305e-07, "loss": 0.2371, "num_tokens": 8696448.0, "reward": 0.7003173828125, "reward_std": 0.029017120599746704, "rewards//mean": 0.7003173828125, "rewards//std": 0.050186194479465485, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2014, "grad_norm": 3.0061981678009033, "kl": 1.3387518040835857, "learning_rate": 9.107556278803002e-07, "loss": 0.1339, "num_tokens": 8705088.0, "reward": 0.75897216796875, "reward_std": 0.008711813017725945, "rewards//mean": 0.75897216796875, "rewards//std": 0.02459406480193138, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2016, "grad_norm": 4.439276695251465, "kl": 2.059834100306034, "learning_rate": 9.10574604566852e-07, "loss": 0.206, "num_tokens": 8713744.0, "reward": 0.70880126953125, "reward_std": 0.013922769576311111, "rewards//mean": 0.70880126953125, "rewards//std": 0.03169580549001694, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2018, "grad_norm": 3.0243308544158936, "kl": 1.7723627705127, "learning_rate": 9.103934158740022e-07, "loss": 0.1772, "num_tokens": 8722456.0, "reward": 0.776611328125, "reward_std": 0.016336709260940552, "rewards//mean": 0.776611328125, "rewards//std": 0.030835507437586784, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.202, "grad_norm": 4.09769344329834, "kl": 2.093655541539192, "learning_rate": 9.102120618747336e-07, "loss": 0.2094, "num_tokens": 8731136.0, "reward": 0.75537109375, "reward_std": 0.01660916581749916, "rewards//mean": 0.75537109375, "rewards//std": 0.03684491664171219, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2022, "grad_norm": 5.781892776489258, "kl": 1.4492920245975256, "learning_rate": 9.100305426420956e-07, "loss": 0.1449, "num_tokens": 8739872.0, "reward": 0.76702880859375, "reward_std": 0.01709713786840439, "rewards//mean": 0.76702880859375, "rewards//std": 0.027030762284994125, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2024, "grad_norm": 5.448172569274902, "kl": 1.5353701952844858, "learning_rate": 9.098488582492039e-07, "loss": 0.1535, "num_tokens": 8748488.0, "reward": 0.76947021484375, "reward_std": 0.017640870064496994, "rewards//mean": 0.76947021484375, "rewards//std": 0.03162360563874245, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2026, "grad_norm": 4.6841936111450195, "kl": 1.4715809114277363, "learning_rate": 9.096670087692411e-07, "loss": 0.1472, "num_tokens": 8757128.0, "reward": 0.76654052734375, "reward_std": 0.017565356567502022, "rewards//mean": 0.76654052734375, "rewards//std": 0.036096084862947464, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2028, "grad_norm": 3.725461959838867, "kl": 1.6743265595287085, "learning_rate": 9.094849942754563e-07, "loss": 0.1674, "num_tokens": 8765896.0, "reward": 0.72198486328125, "reward_std": 0.011678006500005722, "rewards//mean": 0.72198486328125, "rewards//std": 0.044230084866285324, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.203, "grad_norm": 6.629767894744873, "kl": 2.057219333946705, "learning_rate": 9.093028148411648e-07, "loss": 0.2057, "num_tokens": 8774560.0, "reward": 0.77020263671875, "reward_std": 0.015884580090641975, "rewards//mean": 0.77020263671875, "rewards//std": 0.031578097492456436, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2032, "grad_norm": 3.7993109226226807, "kl": 2.157150162383914, "learning_rate": 9.091204705397483e-07, "loss": 0.2157, "num_tokens": 8783240.0, "reward": 0.75518798828125, "reward_std": 0.01860562339425087, "rewards//mean": 0.75518798828125, "rewards//std": 0.033516623079776764, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2034, "grad_norm": 7.185744762420654, "kl": 1.5592702366411686, "learning_rate": 9.089379614446553e-07, "loss": 0.1559, "num_tokens": 8791904.0, "reward": 0.76171875, "reward_std": 0.014105849899351597, "rewards//mean": 0.76171875, "rewards//std": 0.03168105334043503, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2036, "grad_norm": 3.291555166244507, "kl": 1.2108670603483915, "learning_rate": 9.087552876294002e-07, "loss": 0.1211, "num_tokens": 8800440.0, "reward": 0.753173828125, "reward_std": 0.010028994642198086, "rewards//mean": 0.753173828125, "rewards//std": 0.02954385057091713, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2038, "grad_norm": 3.1324665546417236, "kl": 1.5204577669501305, "learning_rate": 9.085724491675642e-07, "loss": 0.152, "num_tokens": 8809040.0, "reward": 0.7515869140625, "reward_std": 0.010083088651299477, "rewards//mean": 0.7515869140625, "rewards//std": 0.03368719294667244, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.204, "grad_norm": 4.665890693664551, "kl": 1.2971952389925718, "learning_rate": 9.083894461327945e-07, "loss": 0.1297, "num_tokens": 8817712.0, "reward": 0.76116943359375, "reward_std": 0.008947715163230896, "rewards//mean": 0.76116943359375, "rewards//std": 0.039517853409051895, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2042, "grad_norm": 9.229904174804688, "kl": 1.8431557640433311, "learning_rate": 9.082062785988048e-07, "loss": 0.1843, "num_tokens": 8826296.0, "reward": 0.7672119140625, "reward_std": 0.012240275740623474, "rewards//mean": 0.7672119140625, "rewards//std": 0.028397180140018463, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2044, "grad_norm": 4.694721221923828, "kl": 1.4546211212873459, "learning_rate": 9.080229466393749e-07, "loss": 0.1455, "num_tokens": 8835024.0, "reward": 0.77215576171875, "reward_std": 0.01088004745543003, "rewards//mean": 0.77215576171875, "rewards//std": 0.02693820185959339, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2046, "grad_norm": 3.1893560886383057, "kl": 1.9206580389291048, "learning_rate": 9.078394503283508e-07, "loss": 0.1921, "num_tokens": 8843712.0, "reward": 0.73529052734375, "reward_std": 0.009692528285086155, "rewards//mean": 0.73529052734375, "rewards//std": 0.02680017240345478, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2048, "grad_norm": 2.479396343231201, "kl": 1.483644813299179, "learning_rate": 9.076557897396451e-07, "loss": 0.1484, "num_tokens": 8852368.0, "reward": 0.7568359375, "reward_std": 0.01286611519753933, "rewards//mean": 0.7568359375, "rewards//std": 0.029911501333117485, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.205, "grad_norm": 2.945749044418335, "kl": 0.9529800787568092, "learning_rate": 9.074719649472357e-07, "loss": 0.0953, "num_tokens": 8860928.0, "reward": 0.736328125, "reward_std": 0.005859419237822294, "rewards//mean": 0.736328125, "rewards//std": 0.028476230800151825, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2052, "grad_norm": 6.230681419372559, "kl": 1.6063654609024525, "learning_rate": 9.072879760251679e-07, "loss": 0.1606, "num_tokens": 8869624.0, "reward": 0.74969482421875, "reward_std": 0.012421256862580776, "rewards//mean": 0.74969482421875, "rewards//std": 0.031919483095407486, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2054, "grad_norm": 1.848879337310791, "kl": 1.7765672486275434, "learning_rate": 9.071038230475519e-07, "loss": 0.1777, "num_tokens": 8878224.0, "reward": 0.7886962890625, "reward_std": 0.012133300304412842, "rewards//mean": 0.7886962890625, "rewards//std": 0.030070748180150986, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2056, "grad_norm": 10.26515007019043, "kl": 1.5696597695350647, "learning_rate": 9.069195060885646e-07, "loss": 0.157, "num_tokens": 8886832.0, "reward": 0.7757568359375, "reward_std": 0.012210506945848465, "rewards//mean": 0.7757568359375, "rewards//std": 0.027906784787774086, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2058, "grad_norm": 7.927394390106201, "kl": 2.835561953485012, "learning_rate": 9.067350252224489e-07, "loss": 0.2836, "num_tokens": 8895536.0, "reward": 0.74676513671875, "reward_std": 0.015318479388952255, "rewards//mean": 0.74676513671875, "rewards//std": 0.02599097415804863, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.206, "grad_norm": 3.9940266609191895, "kl": 1.416461167857051, "learning_rate": 9.065503805235137e-07, "loss": 0.1416, "num_tokens": 8904104.0, "reward": 0.7501220703125, "reward_std": 0.009496974758803844, "rewards//mean": 0.7501220703125, "rewards//std": 0.031350355595350266, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2062, "grad_norm": 2.798651695251465, "kl": 1.3074674978852272, "learning_rate": 9.06365572066134e-07, "loss": 0.1307, "num_tokens": 8912784.0, "reward": 0.76141357421875, "reward_std": 0.007983425632119179, "rewards//mean": 0.76141357421875, "rewards//std": 0.020591329783201218, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2064, "grad_norm": 12.279181480407715, "kl": 3.5611584540456533, "learning_rate": 9.061805999247503e-07, "loss": 0.3561, "num_tokens": 8921536.0, "reward": 0.77001953125, "reward_std": 0.021788431331515312, "rewards//mean": 0.77001953125, "rewards//std": 0.04297438636422157, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2066, "grad_norm": 8.397802352905273, "kl": 2.812007764354348, "learning_rate": 9.059954641738697e-07, "loss": 0.2812, "num_tokens": 8930136.0, "reward": 0.72967529296875, "reward_std": 0.011716771870851517, "rewards//mean": 0.72967529296875, "rewards//std": 0.03453541174530983, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2068, "grad_norm": 8.99998950958252, "kl": 2.567307475954294, "learning_rate": 9.058101648880645e-07, "loss": 0.2567, "num_tokens": 8938864.0, "reward": 0.777099609375, "reward_std": 0.010260752402245998, "rewards//mean": 0.777099609375, "rewards//std": 0.026161137968301773, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.207, "grad_norm": 2.230818033218384, "kl": 1.468862995505333, "learning_rate": 9.056247021419734e-07, "loss": 0.1469, "num_tokens": 8947568.0, "reward": 0.7591552734375, "reward_std": 0.010790163651108742, "rewards//mean": 0.7591552734375, "rewards//std": 0.028401443734765053, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2072, "grad_norm": 7.6791558265686035, "kl": 2.5379223749041557, "learning_rate": 9.054390760103009e-07, "loss": 0.2538, "num_tokens": 8956272.0, "reward": 0.75360107421875, "reward_std": 0.012005605734884739, "rewards//mean": 0.75360107421875, "rewards//std": 0.02732429839670658, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2074, "grad_norm": 2.4393842220306396, "kl": 1.5688249077647924, "learning_rate": 9.052532865678171e-07, "loss": 0.1569, "num_tokens": 8964968.0, "reward": 0.7652587890625, "reward_std": 0.00900060124695301, "rewards//mean": 0.7652587890625, "rewards//std": 0.03030342608690262, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2076, "grad_norm": 2.1573307514190674, "kl": 1.6273485254496336, "learning_rate": 9.050673338893577e-07, "loss": 0.1627, "num_tokens": 8973680.0, "reward": 0.73858642578125, "reward_std": 0.009875812567770481, "rewards//mean": 0.73858642578125, "rewards//std": 0.031805459409952164, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2078, "grad_norm": 2.5634870529174805, "kl": 2.4672775603830814, "learning_rate": 9.04881218049825e-07, "loss": 0.2467, "num_tokens": 8982352.0, "reward": 0.7288818359375, "reward_std": 0.014227893203496933, "rewards//mean": 0.7288818359375, "rewards//std": 0.032494135200977325, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.208, "grad_norm": 2.005077838897705, "kl": 0.9139247164130211, "learning_rate": 9.046949391241858e-07, "loss": 0.0914, "num_tokens": 8991008.0, "reward": 0.72296142578125, "reward_std": 0.004710361361503601, "rewards//mean": 0.72296142578125, "rewards//std": 0.031182534992694855, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2082, "grad_norm": 8.661205291748047, "kl": 2.7401093523949385, "learning_rate": 9.045084971874737e-07, "loss": 0.274, "num_tokens": 8999736.0, "reward": 0.7806396484375, "reward_std": 0.01274215430021286, "rewards//mean": 0.7806396484375, "rewards//std": 0.030058663338422775, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2084, "grad_norm": 13.921006202697754, "kl": 2.183920970186591, "learning_rate": 9.043218923147873e-07, "loss": 0.2184, "num_tokens": 9008384.0, "reward": 0.73284912109375, "reward_std": 0.008773128502070904, "rewards//mean": 0.73284912109375, "rewards//std": 0.04187897592782974, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2086, "grad_norm": 5.314465045928955, "kl": 1.8124338928610086, "learning_rate": 9.04135124581291e-07, "loss": 0.1812, "num_tokens": 9017096.0, "reward": 0.75048828125, "reward_std": 0.01586126536130905, "rewards//mean": 0.75048828125, "rewards//std": 0.027511531487107277, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2088, "grad_norm": 5.20868444442749, "kl": 3.3300900626927614, "learning_rate": 9.039481940622146e-07, "loss": 0.333, "num_tokens": 9025688.0, "reward": 0.75067138671875, "reward_std": 0.017955124378204346, "rewards//mean": 0.75067138671875, "rewards//std": 0.02610255777835846, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.209, "grad_norm": 5.6909356117248535, "kl": 2.0218575745821, "learning_rate": 9.037611008328543e-07, "loss": 0.2022, "num_tokens": 9034344.0, "reward": 0.7503662109375, "reward_std": 0.013804212212562561, "rewards//mean": 0.7503662109375, "rewards//std": 0.0436466783285141, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2092, "grad_norm": 4.027197360992432, "kl": 2.3590909838676453, "learning_rate": 9.035738449685706e-07, "loss": 0.2359, "num_tokens": 9043032.0, "reward": 0.74188232421875, "reward_std": 0.01404221449047327, "rewards//mean": 0.74188232421875, "rewards//std": 0.04413003474473953, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2094, "grad_norm": 2.821146011352539, "kl": 1.6312313880771399, "learning_rate": 9.033864265447906e-07, "loss": 0.1631, "num_tokens": 9051672.0, "reward": 0.7440185546875, "reward_std": 0.01578584499657154, "rewards//mean": 0.7440185546875, "rewards//std": 0.03487589955329895, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2096, "grad_norm": 2.958963632583618, "kl": 1.6689205151051283, "learning_rate": 9.031988456370061e-07, "loss": 0.1669, "num_tokens": 9060280.0, "reward": 0.71722412109375, "reward_std": 0.013462478294968605, "rewards//mean": 0.71722412109375, "rewards//std": 0.02801850624382496, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2098, "grad_norm": 10.404927253723145, "kl": 2.314605975523591, "learning_rate": 9.030111023207749e-07, "loss": 0.2315, "num_tokens": 9068992.0, "reward": 0.73309326171875, "reward_std": 0.010471160523593426, "rewards//mean": 0.73309326171875, "rewards//std": 0.04128742218017578, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.21, "grad_norm": 4.192626476287842, "kl": 2.7768554501235485, "learning_rate": 9.028231966717198e-07, "loss": 0.2777, "num_tokens": 9077648.0, "reward": 0.75421142578125, "reward_std": 0.016651127487421036, "rewards//mean": 0.75421142578125, "rewards//std": 0.03971616178750992, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2102, "grad_norm": 6.3300604820251465, "kl": 2.7328280713409185, "learning_rate": 9.026351287655293e-07, "loss": 0.2733, "num_tokens": 9086304.0, "reward": 0.7623291015625, "reward_std": 0.018146753311157227, "rewards//mean": 0.7623291015625, "rewards//std": 0.036233220249414444, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2104, "grad_norm": 5.513199806213379, "kl": 1.0714191440492868, "learning_rate": 9.02446898677957e-07, "loss": 0.1071, "num_tokens": 9094952.0, "reward": 0.78021240234375, "reward_std": 0.006924469955265522, "rewards//mean": 0.78021240234375, "rewards//std": 0.021868381649255753, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2106, "grad_norm": 4.735969066619873, "kl": 1.3821017984300852, "learning_rate": 9.02258506484822e-07, "loss": 0.1382, "num_tokens": 9103560.0, "reward": 0.763427734375, "reward_std": 0.012097623199224472, "rewards//mean": 0.763427734375, "rewards//std": 0.031218014657497406, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2108, "grad_norm": 5.088438034057617, "kl": 1.687829440459609, "learning_rate": 9.02069952262009e-07, "loss": 0.1688, "num_tokens": 9112192.0, "reward": 0.7362060546875, "reward_std": 0.008893121033906937, "rewards//mean": 0.7362060546875, "rewards//std": 0.04226670414209366, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.211, "grad_norm": 9.628849983215332, "kl": 3.9030132219195366, "learning_rate": 9.018812360854671e-07, "loss": 0.3903, "num_tokens": 9120840.0, "reward": 0.74041748046875, "reward_std": 0.02466106228530407, "rewards//mean": 0.74041748046875, "rewards//std": 0.03548832982778549, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2112, "grad_norm": 9.033496856689453, "kl": 1.4859026093035936, "learning_rate": 9.016923580312113e-07, "loss": 0.1486, "num_tokens": 9129488.0, "reward": 0.76214599609375, "reward_std": 0.012279321439564228, "rewards//mean": 0.76214599609375, "rewards//std": 0.029521232470870018, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2114, "grad_norm": 3.2272932529449463, "kl": 2.0986061356961727, "learning_rate": 9.015033181753218e-07, "loss": 0.2099, "num_tokens": 9138040.0, "reward": 0.745361328125, "reward_std": 0.020388251170516014, "rewards//mean": 0.745361328125, "rewards//std": 0.041694995015859604, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2116, "grad_norm": 2.655343532562256, "kl": 2.4686303231865168, "learning_rate": 9.013141165939438e-07, "loss": 0.2469, "num_tokens": 9146648.0, "reward": 0.771484375, "reward_std": 0.02447417750954628, "rewards//mean": 0.771484375, "rewards//std": 0.03653796762228012, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2118, "grad_norm": 4.1452317237854, "kl": 1.6574193220585585, "learning_rate": 9.011247533632875e-07, "loss": 0.1657, "num_tokens": 9155216.0, "reward": 0.74169921875, "reward_std": 0.009907124564051628, "rewards//mean": 0.74169921875, "rewards//std": 0.030205607414245605, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.212, "grad_norm": 2.804823875427246, "kl": 1.7262969482690096, "learning_rate": 9.009352285596285e-07, "loss": 0.1726, "num_tokens": 9163848.0, "reward": 0.7520751953125, "reward_std": 0.01055437233299017, "rewards//mean": 0.7520751953125, "rewards//std": 0.022412359714508057, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2122, "grad_norm": 11.700467109680176, "kl": 2.8435444589704275, "learning_rate": 9.007455422593075e-07, "loss": 0.2844, "num_tokens": 9172520.0, "reward": 0.76397705078125, "reward_std": 0.015800870954990387, "rewards//mean": 0.76397705078125, "rewards//std": 0.042162489145994186, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2124, "grad_norm": 6.289818286895752, "kl": 1.9879839420318604, "learning_rate": 9.0055569453873e-07, "loss": 0.1988, "num_tokens": 9181096.0, "reward": 0.75042724609375, "reward_std": 0.01895938068628311, "rewards//mean": 0.75042724609375, "rewards//std": 0.03131284937262535, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2126, "grad_norm": 2.4594168663024902, "kl": 2.264555150642991, "learning_rate": 9.003656854743666e-07, "loss": 0.2265, "num_tokens": 9189752.0, "reward": 0.71282958984375, "reward_std": 0.01371677964925766, "rewards//mean": 0.71282958984375, "rewards//std": 0.04367593303322792, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2128, "grad_norm": 5.355173110961914, "kl": 1.163853295147419, "learning_rate": 9.00175515142753e-07, "loss": 0.1164, "num_tokens": 9198392.0, "reward": 0.7357177734375, "reward_std": 0.013257784768939018, "rewards//mean": 0.7357177734375, "rewards//std": 0.036836493760347366, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.213, "grad_norm": 7.627799034118652, "kl": 2.075035708025098, "learning_rate": 8.9998518362049e-07, "loss": 0.2075, "num_tokens": 9207120.0, "reward": 0.74114990234375, "reward_std": 0.016323495656251907, "rewards//mean": 0.74114990234375, "rewards//std": 0.03852081298828125, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2132, "grad_norm": 2.5370049476623535, "kl": 1.1113354787230492, "learning_rate": 8.997946909842424e-07, "loss": 0.1111, "num_tokens": 9215808.0, "reward": 0.77667236328125, "reward_std": 0.007781160529702902, "rewards//mean": 0.77667236328125, "rewards//std": 0.024024909362196922, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2134, "grad_norm": 3.582096815109253, "kl": 2.111095203086734, "learning_rate": 8.996040373107414e-07, "loss": 0.2111, "num_tokens": 9224600.0, "reward": 0.7498779296875, "reward_std": 0.013851397670805454, "rewards//mean": 0.7498779296875, "rewards//std": 0.03584172949194908, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2136, "grad_norm": 6.173175811767578, "kl": 2.086049735546112, "learning_rate": 8.994132226767819e-07, "loss": 0.2086, "num_tokens": 9233272.0, "reward": 0.76031494140625, "reward_std": 0.018188945949077606, "rewards//mean": 0.76031494140625, "rewards//std": 0.04558815062046051, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2138, "grad_norm": 11.23176383972168, "kl": 2.007845725864172, "learning_rate": 8.992222471592239e-07, "loss": 0.2008, "num_tokens": 9241864.0, "reward": 0.7625732421875, "reward_std": 0.01386364083737135, "rewards//mean": 0.7625732421875, "rewards//std": 0.03949088603258133, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.214, "grad_norm": 4.25261926651001, "kl": 1.6897999960929155, "learning_rate": 8.990311108349926e-07, "loss": 0.169, "num_tokens": 9250568.0, "reward": 0.75799560546875, "reward_std": 0.015520873479545116, "rewards//mean": 0.75799560546875, "rewards//std": 0.04118390008807182, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2142, "grad_norm": 4.411772727966309, "kl": 2.255348764359951, "learning_rate": 8.988398137810776e-07, "loss": 0.2255, "num_tokens": 9259280.0, "reward": 0.74029541015625, "reward_std": 0.011341812089085579, "rewards//mean": 0.74029541015625, "rewards//std": 0.019697854295372963, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2144, "grad_norm": 6.922074317932129, "kl": 1.9375630132853985, "learning_rate": 8.986483560745333e-07, "loss": 0.1938, "num_tokens": 9267928.0, "reward": 0.74188232421875, "reward_std": 0.010026191361248493, "rewards//mean": 0.74188232421875, "rewards//std": 0.02257748320698738, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2146, "grad_norm": 5.813230991363525, "kl": 2.6822686679661274, "learning_rate": 8.984567377924789e-07, "loss": 0.2682, "num_tokens": 9276600.0, "reward": 0.7318115234375, "reward_std": 0.020570648834109306, "rewards//mean": 0.7318115234375, "rewards//std": 0.03699395805597305, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2148, "grad_norm": 4.24099588394165, "kl": 1.593309286981821, "learning_rate": 8.982649590120981e-07, "loss": 0.1593, "num_tokens": 9285168.0, "reward": 0.767578125, "reward_std": 0.016599806025624275, "rewards//mean": 0.767578125, "rewards//std": 0.04181753098964691, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.215, "grad_norm": 30.643138885498047, "kl": 1.741103883832693, "learning_rate": 8.980730198106394e-07, "loss": 0.1741, "num_tokens": 9293880.0, "reward": 0.71392822265625, "reward_std": 0.007119806483387947, "rewards//mean": 0.71392822265625, "rewards//std": 0.03858049958944321, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2152, "grad_norm": 33.59050750732422, "kl": 2.6876457687467337, "learning_rate": 8.97880920265416e-07, "loss": 0.2688, "num_tokens": 9302696.0, "reward": 0.75274658203125, "reward_std": 0.012646064162254333, "rewards//mean": 0.75274658203125, "rewards//std": 0.035677216947078705, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2154, "grad_norm": 24.11741065979004, "kl": 3.5284326169639826, "learning_rate": 8.976886604538055e-07, "loss": 0.3528, "num_tokens": 9311360.0, "reward": 0.75982666015625, "reward_std": 0.022194216027855873, "rewards//mean": 0.75982666015625, "rewards//std": 0.04469548910856247, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2156, "grad_norm": 11.08321475982666, "kl": 2.338799251243472, "learning_rate": 8.974962404532501e-07, "loss": 0.2339, "num_tokens": 9320040.0, "reward": 0.76312255859375, "reward_std": 0.012554142624139786, "rewards//mean": 0.76312255859375, "rewards//std": 0.03452971577644348, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2158, "grad_norm": 27.886621475219727, "kl": 1.4472415745258331, "learning_rate": 8.973036603412566e-07, "loss": 0.1447, "num_tokens": 9328624.0, "reward": 0.7734375, "reward_std": 0.01772221550345421, "rewards//mean": 0.7734375, "rewards//std": 0.03383985534310341, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.216, "grad_norm": 8.311849594116211, "kl": 1.8753036558628082, "learning_rate": 8.971109201953962e-07, "loss": 0.1875, "num_tokens": 9337216.0, "reward": 0.759521484375, "reward_std": 0.022103890776634216, "rewards//mean": 0.759521484375, "rewards//std": 0.033681128174066544, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2162, "grad_norm": 4.862329006195068, "kl": 1.6421557180583477, "learning_rate": 8.969180200933047e-07, "loss": 0.1642, "num_tokens": 9345800.0, "reward": 0.75360107421875, "reward_std": 0.0184025838971138, "rewards//mean": 0.75360107421875, "rewards//std": 0.03356897458434105, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2164, "grad_norm": 3.538501739501953, "kl": 1.4810181353241205, "learning_rate": 8.967249601126821e-07, "loss": 0.1481, "num_tokens": 9354368.0, "reward": 0.75732421875, "reward_std": 0.016151435673236847, "rewards//mean": 0.75732421875, "rewards//std": 0.04505982622504234, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2166, "grad_norm": 26.427112579345703, "kl": 1.828379400074482, "learning_rate": 8.96531740331293e-07, "loss": 0.1828, "num_tokens": 9363000.0, "reward": 0.76458740234375, "reward_std": 0.01369639951735735, "rewards//mean": 0.76458740234375, "rewards//std": 0.03298625722527504, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2168, "grad_norm": 12.101463317871094, "kl": 1.2811105847358704, "learning_rate": 8.963383608269663e-07, "loss": 0.1281, "num_tokens": 9371672.0, "reward": 0.75506591796875, "reward_std": 0.009559770114719868, "rewards//mean": 0.75506591796875, "rewards//std": 0.02435656450688839, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.217, "grad_norm": 12.138514518737793, "kl": 2.161335153505206, "learning_rate": 8.961448216775953e-07, "loss": 0.2161, "num_tokens": 9380344.0, "reward": 0.73681640625, "reward_std": 0.009760679677128792, "rewards//mean": 0.73681640625, "rewards//std": 0.026083486154675484, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2172, "grad_norm": 24.755420684814453, "kl": 2.8088847771286964, "learning_rate": 8.959511229611375e-07, "loss": 0.2809, "num_tokens": 9389040.0, "reward": 0.75531005859375, "reward_std": 0.014149040915071964, "rewards//mean": 0.75531005859375, "rewards//std": 0.02670624479651451, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2174, "grad_norm": 10.738611221313477, "kl": 2.352067621424794, "learning_rate": 8.957572647556147e-07, "loss": 0.2352, "num_tokens": 9397592.0, "reward": 0.7427978515625, "reward_std": 0.01083272136747837, "rewards//mean": 0.7427978515625, "rewards//std": 0.03283890709280968, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2176, "grad_norm": 36.97765350341797, "kl": 4.06044471822679, "learning_rate": 8.95563247139113e-07, "loss": 0.406, "num_tokens": 9406272.0, "reward": 0.7430419921875, "reward_std": 0.015127220191061497, "rewards//mean": 0.7430419921875, "rewards//std": 0.055607471615076065, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2178, "grad_norm": 13.865893363952637, "kl": 2.522465394809842, "learning_rate": 8.953690701897827e-07, "loss": 0.2522, "num_tokens": 9414848.0, "reward": 0.7244873046875, "reward_std": 0.017161235213279724, "rewards//mean": 0.7244873046875, "rewards//std": 0.03700541704893112, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.218, "grad_norm": 6.7566680908203125, "kl": 1.7745242714881897, "learning_rate": 8.951747339858382e-07, "loss": 0.1775, "num_tokens": 9423448.0, "reward": 0.7322998046875, "reward_std": 0.008848993107676506, "rewards//mean": 0.7322998046875, "rewards//std": 0.03039320930838585, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2182, "grad_norm": 32.137081146240234, "kl": 2.067481989040971, "learning_rate": 8.94980238605558e-07, "loss": 0.2067, "num_tokens": 9432040.0, "reward": 0.748046875, "reward_std": 0.01435694471001625, "rewards//mean": 0.748046875, "rewards//std": 0.03106343187391758, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2184, "grad_norm": 27.685312271118164, "kl": 1.8618167992681265, "learning_rate": 8.947855841272851e-07, "loss": 0.1862, "num_tokens": 9440632.0, "reward": 0.75518798828125, "reward_std": 0.010974636301398277, "rewards//mean": 0.75518798828125, "rewards//std": 0.02978210709989071, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2186, "grad_norm": 7.296536445617676, "kl": 1.5944556891918182, "learning_rate": 8.94590770629426e-07, "loss": 0.1594, "num_tokens": 9449312.0, "reward": 0.6986083984375, "reward_std": 0.010842295363545418, "rewards//mean": 0.6986083984375, "rewards//std": 0.03731987997889519, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2188, "grad_norm": 4.455016613006592, "kl": 1.0433367993682623, "learning_rate": 8.943957981904517e-07, "loss": 0.1043, "num_tokens": 9458032.0, "reward": 0.76824951171875, "reward_std": 0.00847709272056818, "rewards//mean": 0.76824951171875, "rewards//std": 0.031102817505598068, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.219, "grad_norm": 3.9581363201141357, "kl": 0.7808334045112133, "learning_rate": 8.942006668888971e-07, "loss": 0.0781, "num_tokens": 9466552.0, "reward": 0.756103515625, "reward_std": 0.006937914527952671, "rewards//mean": 0.756103515625, "rewards//std": 0.024068424478173256, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2192, "grad_norm": 3.87412166595459, "kl": 1.3768600430339575, "learning_rate": 8.940053768033608e-07, "loss": 0.1377, "num_tokens": 9475248.0, "reward": 0.75006103515625, "reward_std": 0.008184343576431274, "rewards//mean": 0.75006103515625, "rewards//std": 0.036851439625024796, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2194, "grad_norm": 5.8469061851501465, "kl": 0.9254527110606432, "learning_rate": 8.938099280125062e-07, "loss": 0.0925, "num_tokens": 9483984.0, "reward": 0.77789306640625, "reward_std": 0.01542261429131031, "rewards//mean": 0.77789306640625, "rewards//std": 0.036958906799554825, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2196, "grad_norm": 8.163247108459473, "kl": 1.3382260501384735, "learning_rate": 8.936143205950595e-07, "loss": 0.1338, "num_tokens": 9492560.0, "reward": 0.74798583984375, "reward_std": 0.01950734481215477, "rewards//mean": 0.74798583984375, "rewards//std": 0.03747384622693062, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2198, "grad_norm": 4.726160526275635, "kl": 0.6422359738498926, "learning_rate": 8.934185546298115e-07, "loss": 0.0642, "num_tokens": 9501136.0, "reward": 0.70245361328125, "reward_std": 0.0035011344589293003, "rewards//mean": 0.70245361328125, "rewards//std": 0.038333695381879807, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.22, "grad_norm": 3.5903127193450928, "kl": 1.405467739328742, "learning_rate": 8.932226301956169e-07, "loss": 0.1405, "num_tokens": 9509816.0, "reward": 0.7440185546875, "reward_std": 0.014332575723528862, "rewards//mean": 0.7440185546875, "rewards//std": 0.034470751881599426, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2202, "grad_norm": 4.708259582519531, "kl": 1.6381273418664932, "learning_rate": 8.930265473713937e-07, "loss": 0.1638, "num_tokens": 9518472.0, "reward": 0.75689697265625, "reward_std": 0.021110452711582184, "rewards//mean": 0.75689697265625, "rewards//std": 0.04539315402507782, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2204, "grad_norm": 3.713083267211914, "kl": 0.80934071354568, "learning_rate": 8.928303062361243e-07, "loss": 0.0809, "num_tokens": 9527136.0, "reward": 0.73992919921875, "reward_std": 0.006100708618760109, "rewards//mean": 0.73992919921875, "rewards//std": 0.03321719914674759, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2206, "grad_norm": 5.256967544555664, "kl": 1.395573116838932, "learning_rate": 8.926339068688545e-07, "loss": 0.1396, "num_tokens": 9535776.0, "reward": 0.74481201171875, "reward_std": 0.014384103938937187, "rewards//mean": 0.74481201171875, "rewards//std": 0.041369467973709106, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2208, "grad_norm": 9.163372993469238, "kl": 1.5889676082879305, "learning_rate": 8.924373493486941e-07, "loss": 0.1589, "num_tokens": 9544424.0, "reward": 0.74603271484375, "reward_std": 0.018748531118035316, "rewards//mean": 0.74603271484375, "rewards//std": 0.04023231193423271, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.221, "grad_norm": 4.666876316070557, "kl": 1.8292632326483727, "learning_rate": 8.922406337548161e-07, "loss": 0.1829, "num_tokens": 9553032.0, "reward": 0.71136474609375, "reward_std": 0.013210605829954147, "rewards//mean": 0.71136474609375, "rewards//std": 0.029798876494169235, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2212, "grad_norm": 3.660369634628296, "kl": 2.339859602972865, "learning_rate": 8.920437601664579e-07, "loss": 0.234, "num_tokens": 9561712.0, "reward": 0.77392578125, "reward_std": 0.023278575390577316, "rewards//mean": 0.77392578125, "rewards//std": 0.041721854358911514, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2214, "grad_norm": 3.374110698699951, "kl": 1.1332755852490664, "learning_rate": 8.918467286629198e-07, "loss": 0.1133, "num_tokens": 9570320.0, "reward": 0.73443603515625, "reward_std": 0.00900747999548912, "rewards//mean": 0.73443603515625, "rewards//std": 0.03303944692015648, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2216, "grad_norm": 8.907243728637695, "kl": 1.6807291693985462, "learning_rate": 8.916495393235665e-07, "loss": 0.1681, "num_tokens": 9578848.0, "reward": 0.74151611328125, "reward_std": 0.008039504289627075, "rewards//mean": 0.74151611328125, "rewards//std": 0.03710075840353966, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2218, "grad_norm": 5.270492076873779, "kl": 1.2160240355879068, "learning_rate": 8.914521922278255e-07, "loss": 0.1216, "num_tokens": 9587480.0, "reward": 0.73681640625, "reward_std": 0.008157813921570778, "rewards//mean": 0.73681640625, "rewards//std": 0.03306881710886955, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.222, "grad_norm": 3.2378830909729004, "kl": 1.6577156893908978, "learning_rate": 8.912546874551882e-07, "loss": 0.1658, "num_tokens": 9596152.0, "reward": 0.75250244140625, "reward_std": 0.011570584028959274, "rewards//mean": 0.75250244140625, "rewards//std": 0.03636099398136139, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2222, "grad_norm": 3.7634778022766113, "kl": 1.8262662645429373, "learning_rate": 8.910570250852096e-07, "loss": 0.1826, "num_tokens": 9604800.0, "reward": 0.74334716796875, "reward_std": 0.015612797811627388, "rewards//mean": 0.74334716796875, "rewards//std": 0.03957182541489601, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2224, "grad_norm": 5.38132381439209, "kl": 1.1248016580939293, "learning_rate": 8.908592051975081e-07, "loss": 0.1125, "num_tokens": 9613480.0, "reward": 0.76190185546875, "reward_std": 0.0102156363427639, "rewards//mean": 0.76190185546875, "rewards//std": 0.027528515085577965, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2226, "grad_norm": 3.525296926498413, "kl": 1.8597209546715021, "learning_rate": 8.906612278717655e-07, "loss": 0.186, "num_tokens": 9622088.0, "reward": 0.7320556640625, "reward_std": 0.016202254220843315, "rewards//mean": 0.7320556640625, "rewards//std": 0.043121904134750366, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2228, "grad_norm": 6.389703273773193, "kl": 0.8898295853286982, "learning_rate": 8.90463093187727e-07, "loss": 0.089, "num_tokens": 9630704.0, "reward": 0.762939453125, "reward_std": 0.008746866136789322, "rewards//mean": 0.762939453125, "rewards//std": 0.017946293577551842, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.223, "grad_norm": 4.210634231567383, "kl": 0.8167678322643042, "learning_rate": 8.902648012252012e-07, "loss": 0.0817, "num_tokens": 9639360.0, "reward": 0.7841796875, "reward_std": 0.007580064702779055, "rewards//mean": 0.7841796875, "rewards//std": 0.023602265864610672, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2232, "grad_norm": 7.35414457321167, "kl": 1.5044339783489704, "learning_rate": 8.900663520640603e-07, "loss": 0.1504, "num_tokens": 9647976.0, "reward": 0.732666015625, "reward_std": 0.012166472151875496, "rewards//mean": 0.732666015625, "rewards//std": 0.035268884152173996, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2234, "grad_norm": 4.93861722946167, "kl": 1.047438146546483, "learning_rate": 8.898677457842394e-07, "loss": 0.1047, "num_tokens": 9656608.0, "reward": 0.78271484375, "reward_std": 0.009562061168253422, "rewards//mean": 0.78271484375, "rewards//std": 0.028843844309449196, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2236, "grad_norm": 12.119524002075195, "kl": 1.4735262338072062, "learning_rate": 8.896689824657371e-07, "loss": 0.1474, "num_tokens": 9665184.0, "reward": 0.7591552734375, "reward_std": 0.008323341608047485, "rewards//mean": 0.7591552734375, "rewards//std": 0.024902725592255592, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2238, "grad_norm": 16.558944702148438, "kl": 1.3315905779600143, "learning_rate": 8.894700621886152e-07, "loss": 0.1332, "num_tokens": 9673856.0, "reward": 0.75994873046875, "reward_std": 0.012154627591371536, "rewards//mean": 0.75994873046875, "rewards//std": 0.03866319730877876, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.224, "grad_norm": 3.9123525619506836, "kl": 1.5547582395374775, "learning_rate": 8.892709850329989e-07, "loss": 0.1555, "num_tokens": 9682448.0, "reward": 0.73846435546875, "reward_std": 0.009352664463222027, "rewards//mean": 0.73846435546875, "rewards//std": 0.03554629534482956, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2242, "grad_norm": 2.711242198944092, "kl": 1.0546619202941656, "learning_rate": 8.890717510790762e-07, "loss": 0.1055, "num_tokens": 9691064.0, "reward": 0.74713134765625, "reward_std": 0.00806482508778572, "rewards//mean": 0.74713134765625, "rewards//std": 0.026173792779445648, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2244, "grad_norm": 1.9054088592529297, "kl": 1.5269436184316874, "learning_rate": 8.888723604070989e-07, "loss": 0.1527, "num_tokens": 9699704.0, "reward": 0.76837158203125, "reward_std": 0.011166717857122421, "rewards//mean": 0.76837158203125, "rewards//std": 0.03374438360333443, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2246, "grad_norm": 8.689998626708984, "kl": 1.329160338267684, "learning_rate": 8.886728130973813e-07, "loss": 0.1329, "num_tokens": 9708296.0, "reward": 0.772216796875, "reward_std": 0.012064231559634209, "rewards//mean": 0.772216796875, "rewards//std": 0.03167245164513588, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2248, "grad_norm": 3.48347806930542, "kl": 1.6822141632437706, "learning_rate": 8.884731092303011e-07, "loss": 0.1682, "num_tokens": 9717040.0, "reward": 0.7603759765625, "reward_std": 0.010618474334478378, "rewards//mean": 0.7603759765625, "rewards//std": 0.028883440420031548, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.225, "grad_norm": 7.801677703857422, "kl": 1.9016104098409414, "learning_rate": 8.882732488862987e-07, "loss": 0.1902, "num_tokens": 9725680.0, "reward": 0.73150634765625, "reward_std": 0.011720804497599602, "rewards//mean": 0.73150634765625, "rewards//std": 0.034381214529275894, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2252, "grad_norm": 8.066940307617188, "kl": 2.135226909071207, "learning_rate": 8.880732321458784e-07, "loss": 0.2135, "num_tokens": 9734240.0, "reward": 0.7755126953125, "reward_std": 0.011359816417098045, "rewards//mean": 0.7755126953125, "rewards//std": 0.025581583380699158, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2254, "grad_norm": 1.7799408435821533, "kl": 0.9675129484385252, "learning_rate": 8.878730590896065e-07, "loss": 0.0968, "num_tokens": 9742928.0, "reward": 0.7236328125, "reward_std": 0.004742627497762442, "rewards//mean": 0.7236328125, "rewards//std": 0.03784691169857979, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2256, "grad_norm": 2.759489059448242, "kl": 0.9137209337204695, "learning_rate": 8.876727297981127e-07, "loss": 0.0914, "num_tokens": 9751496.0, "reward": 0.7816162109375, "reward_std": 0.007736141327768564, "rewards//mean": 0.7816162109375, "rewards//std": 0.024604294449090958, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2258, "grad_norm": 2.2090072631835938, "kl": 1.8953998424112797, "learning_rate": 8.874722443520898e-07, "loss": 0.1895, "num_tokens": 9760136.0, "reward": 0.76629638671875, "reward_std": 0.017574312165379524, "rewards//mean": 0.76629638671875, "rewards//std": 0.03510836884379387, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.226, "grad_norm": 3.8313753604888916, "kl": 3.062817746773362, "learning_rate": 8.872716028322931e-07, "loss": 0.3063, "num_tokens": 9768864.0, "reward": 0.76080322265625, "reward_std": 0.021771468222141266, "rewards//mean": 0.76080322265625, "rewards//std": 0.039504826068878174, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2262, "grad_norm": 10.609853744506836, "kl": 2.610361535102129, "learning_rate": 8.870708053195413e-07, "loss": 0.261, "num_tokens": 9777504.0, "reward": 0.73651123046875, "reward_std": 0.013166049495339394, "rewards//mean": 0.73651123046875, "rewards//std": 0.04736807569861412, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2264, "grad_norm": 6.4367289543151855, "kl": 1.8171419892460108, "learning_rate": 8.868698518947151e-07, "loss": 0.1817, "num_tokens": 9786088.0, "reward": 0.7452392578125, "reward_std": 0.008618181571364403, "rewards//mean": 0.7452392578125, "rewards//std": 0.024631349369883537, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2266, "grad_norm": 2.1947710514068604, "kl": 1.0404817126691341, "learning_rate": 8.866687426387591e-07, "loss": 0.104, "num_tokens": 9794704.0, "reward": 0.7593994140625, "reward_std": 0.00733649218454957, "rewards//mean": 0.7593994140625, "rewards//std": 0.03120904229581356, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2268, "grad_norm": 9.567975044250488, "kl": 2.9677893854677677, "learning_rate": 8.864674776326797e-07, "loss": 0.2968, "num_tokens": 9803360.0, "reward": 0.7906494140625, "reward_std": 0.015689987689256668, "rewards//mean": 0.7906494140625, "rewards//std": 0.02821749821305275, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.227, "grad_norm": 1.8530478477478027, "kl": 1.2280470710247755, "learning_rate": 8.862660569575464e-07, "loss": 0.1228, "num_tokens": 9812040.0, "reward": 0.76318359375, "reward_std": 0.007310510613024235, "rewards//mean": 0.76318359375, "rewards//std": 0.025205127894878387, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2272, "grad_norm": 11.196718215942383, "kl": 3.1078683994710445, "learning_rate": 8.860644806944917e-07, "loss": 0.3108, "num_tokens": 9820664.0, "reward": 0.72747802734375, "reward_std": 0.008620038628578186, "rewards//mean": 0.72747802734375, "rewards//std": 0.03392379358410835, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2274, "grad_norm": 6.063144207000732, "kl": 3.298464583232999, "learning_rate": 8.858627489247104e-07, "loss": 0.3298, "num_tokens": 9829264.0, "reward": 0.75042724609375, "reward_std": 0.017651591449975967, "rewards//mean": 0.75042724609375, "rewards//std": 0.036085180938243866, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2276, "grad_norm": 27.15064811706543, "kl": 3.2940468601882458, "learning_rate": 8.856608617294599e-07, "loss": 0.3294, "num_tokens": 9837976.0, "reward": 0.7381591796875, "reward_std": 0.014517206698656082, "rewards//mean": 0.7381591796875, "rewards//std": 0.039622530341148376, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2278, "grad_norm": 2.138314962387085, "kl": 1.4599057100713253, "learning_rate": 8.854588191900604e-07, "loss": 0.146, "num_tokens": 9846624.0, "reward": 0.7435302734375, "reward_std": 0.010030900128185749, "rewards//mean": 0.7435302734375, "rewards//std": 0.027693333104252815, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.228, "grad_norm": 3.1714651584625244, "kl": 1.9531475640833378, "learning_rate": 8.852566213878946e-07, "loss": 0.1953, "num_tokens": 9855208.0, "reward": 0.7593994140625, "reward_std": 0.014453301206231117, "rewards//mean": 0.7593994140625, "rewards//std": 0.0359075553715229, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2282, "grad_norm": 2.9468600749969482, "kl": 2.570426480844617, "learning_rate": 8.850542684044078e-07, "loss": 0.257, "num_tokens": 9863816.0, "reward": 0.7938232421875, "reward_std": 0.016855968162417412, "rewards//mean": 0.7938232421875, "rewards//std": 0.032752130180597305, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2284, "grad_norm": 4.921816825866699, "kl": 1.8722131662070751, "learning_rate": 8.848517603211078e-07, "loss": 0.1872, "num_tokens": 9872392.0, "reward": 0.758544921875, "reward_std": 0.009364070370793343, "rewards//mean": 0.758544921875, "rewards//std": 0.03168009966611862, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2286, "grad_norm": 5.602573394775391, "kl": 2.316109459847212, "learning_rate": 8.846490972195646e-07, "loss": 0.2316, "num_tokens": 9881024.0, "reward": 0.74676513671875, "reward_std": 0.01074385829269886, "rewards//mean": 0.74676513671875, "rewards//std": 0.038679640740156174, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2288, "grad_norm": 46.09404754638672, "kl": 2.3865990675985813, "learning_rate": 8.844462791814112e-07, "loss": 0.2387, "num_tokens": 9889648.0, "reward": 0.74615478515625, "reward_std": 0.007766093127429485, "rewards//mean": 0.74615478515625, "rewards//std": 0.029687926173210144, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.229, "grad_norm": 3.9606568813323975, "kl": 1.375477273017168, "learning_rate": 8.842433062883425e-07, "loss": 0.1375, "num_tokens": 9898224.0, "reward": 0.763427734375, "reward_std": 0.011231972835958004, "rewards//mean": 0.763427734375, "rewards//std": 0.03136507794260979, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2292, "grad_norm": 10.372222900390625, "kl": 2.170891275629401, "learning_rate": 8.840401786221159e-07, "loss": 0.2171, "num_tokens": 9906824.0, "reward": 0.73858642578125, "reward_std": 0.01340460404753685, "rewards//mean": 0.73858642578125, "rewards//std": 0.03745647519826889, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2294, "grad_norm": 3.0244295597076416, "kl": 1.6726904660463333, "learning_rate": 8.838368962645513e-07, "loss": 0.1673, "num_tokens": 9915560.0, "reward": 0.7413330078125, "reward_std": 0.010628901422023773, "rewards//mean": 0.7413330078125, "rewards//std": 0.030895095318555832, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2296, "grad_norm": 6.284627437591553, "kl": 2.278836591169238, "learning_rate": 8.836334592975308e-07, "loss": 0.2279, "num_tokens": 9924232.0, "reward": 0.70709228515625, "reward_std": 0.014068431220948696, "rewards//mean": 0.70709228515625, "rewards//std": 0.0403585359454155, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2298, "grad_norm": 7.906750202178955, "kl": 2.891849149018526, "learning_rate": 8.834298678029988e-07, "loss": 0.2892, "num_tokens": 9932832.0, "reward": 0.752197265625, "reward_std": 0.025658084079623222, "rewards//mean": 0.752197265625, "rewards//std": 0.042965229600667953, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.23, "grad_norm": 3.2445545196533203, "kl": 1.7269501145929098, "learning_rate": 8.83226121862962e-07, "loss": 0.1727, "num_tokens": 9941384.0, "reward": 0.7686767578125, "reward_std": 0.023529019206762314, "rewards//mean": 0.7686767578125, "rewards//std": 0.039650026708841324, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2302, "grad_norm": 3.0770299434661865, "kl": 1.5299071036279202, "learning_rate": 8.83022221559489e-07, "loss": 0.153, "num_tokens": 9949968.0, "reward": 0.7440185546875, "reward_std": 0.015604786574840546, "rewards//mean": 0.7440185546875, "rewards//std": 0.0306471548974514, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2304, "grad_norm": 3.2209055423736572, "kl": 1.4628514032810926, "learning_rate": 8.82818166974711e-07, "loss": 0.1463, "num_tokens": 9958640.0, "reward": 0.7689208984375, "reward_std": 0.009194627404212952, "rewards//mean": 0.7689208984375, "rewards//std": 0.029513860121369362, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2306, "grad_norm": 2.5282833576202393, "kl": 1.9720770809799433, "learning_rate": 8.826139581908211e-07, "loss": 0.1972, "num_tokens": 9967248.0, "reward": 0.76751708984375, "reward_std": 0.013500608503818512, "rewards//mean": 0.76751708984375, "rewards//std": 0.033320486545562744, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2308, "grad_norm": 7.129243850708008, "kl": 1.0265081953257322, "learning_rate": 8.824095952900746e-07, "loss": 0.1027, "num_tokens": 9975864.0, "reward": 0.76123046875, "reward_std": 0.006395334843546152, "rewards//mean": 0.76123046875, "rewards//std": 0.028919318690896034, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.231, "grad_norm": 1.9215022325515747, "kl": 1.4127433262765408, "learning_rate": 8.822050783547889e-07, "loss": 0.1413, "num_tokens": 9984496.0, "reward": 0.77972412109375, "reward_std": 0.013265897519886494, "rewards//mean": 0.77972412109375, "rewards//std": 0.035527125000953674, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2312, "grad_norm": 4.380608558654785, "kl": 1.1788012031465769, "learning_rate": 8.820004074673433e-07, "loss": 0.1179, "num_tokens": 9993096.0, "reward": 0.7825927734375, "reward_std": 0.014705965295433998, "rewards//mean": 0.7825927734375, "rewards//std": 0.028933709487318993, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2314, "grad_norm": 1.7834618091583252, "kl": 1.9318585358560085, "learning_rate": 8.817955827101792e-07, "loss": 0.1932, "num_tokens": 10001696.0, "reward": 0.7471923828125, "reward_std": 0.012094835750758648, "rewards//mean": 0.7471923828125, "rewards//std": 0.03956441581249237, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2316, "grad_norm": 3.228813886642456, "kl": 1.569538813084364, "learning_rate": 8.815906041658001e-07, "loss": 0.157, "num_tokens": 10010312.0, "reward": 0.756103515625, "reward_std": 0.010903415270149708, "rewards//mean": 0.756103515625, "rewards//std": 0.029967118054628372, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2318, "grad_norm": 3.7323246002197266, "kl": 1.4793742503970861, "learning_rate": 8.813854719167712e-07, "loss": 0.1479, "num_tokens": 10018944.0, "reward": 0.75079345703125, "reward_std": 0.009869174100458622, "rewards//mean": 0.75079345703125, "rewards//std": 0.029381943866610527, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.232, "grad_norm": 6.703781604766846, "kl": 1.9111527763307095, "learning_rate": 8.8118018604572e-07, "loss": 0.1911, "num_tokens": 10027520.0, "reward": 0.75616455078125, "reward_std": 0.01446323562413454, "rewards//mean": 0.75616455078125, "rewards//std": 0.037213198840618134, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2322, "grad_norm": 4.319847583770752, "kl": 0.7886298522353172, "learning_rate": 8.809747466353355e-07, "loss": 0.0789, "num_tokens": 10036120.0, "reward": 0.77099609375, "reward_std": 0.005149394273757935, "rewards//mean": 0.77099609375, "rewards//std": 0.026176178827881813, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2324, "grad_norm": 4.049602508544922, "kl": 2.0440140943974257, "learning_rate": 8.807691537683684e-07, "loss": 0.2044, "num_tokens": 10044704.0, "reward": 0.7349853515625, "reward_std": 0.01736447401344776, "rewards//mean": 0.7349853515625, "rewards//std": 0.03697431460022926, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2326, "grad_norm": 3.7227745056152344, "kl": 1.336763946339488, "learning_rate": 8.805634075276317e-07, "loss": 0.1337, "num_tokens": 10053336.0, "reward": 0.75250244140625, "reward_std": 0.01371496170759201, "rewards//mean": 0.75250244140625, "rewards//std": 0.03393539413809776, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2328, "grad_norm": 2.9432053565979004, "kl": 1.2952180672436953, "learning_rate": 8.80357507996e-07, "loss": 0.1295, "num_tokens": 10061936.0, "reward": 0.7847900390625, "reward_std": 0.011655289679765701, "rewards//mean": 0.7847900390625, "rewards//std": 0.024436375126242638, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.233, "grad_norm": 6.470463275909424, "kl": 0.8546751197427511, "learning_rate": 8.801514552564095e-07, "loss": 0.0855, "num_tokens": 10070680.0, "reward": 0.775390625, "reward_std": 0.009649467654526234, "rewards//mean": 0.775390625, "rewards//std": 0.029643084853887558, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2332, "grad_norm": 11.972208023071289, "kl": 2.483306748792529, "learning_rate": 8.799452493918585e-07, "loss": 0.2483, "num_tokens": 10079240.0, "reward": 0.7391357421875, "reward_std": 0.010093813762068748, "rewards//mean": 0.7391357421875, "rewards//std": 0.03905920684337616, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2334, "grad_norm": 0.9611997604370117, "kl": 0.6028448455035686, "learning_rate": 8.797388904854063e-07, "loss": 0.0603, "num_tokens": 10087896.0, "reward": 0.7562255859375, "reward_std": 0.0015408683102577925, "rewards//mean": 0.7562255859375, "rewards//std": 0.025188006460666656, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2336, "grad_norm": 4.824442386627197, "kl": 1.263984639197588, "learning_rate": 8.795323786201745e-07, "loss": 0.1264, "num_tokens": 10096480.0, "reward": 0.77398681640625, "reward_std": 0.016778945922851562, "rewards//mean": 0.77398681640625, "rewards//std": 0.03552883118391037, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2338, "grad_norm": 3.289353847503662, "kl": 0.8853769600391388, "learning_rate": 8.79325713879346e-07, "loss": 0.0885, "num_tokens": 10105048.0, "reward": 0.791259765625, "reward_std": 0.00915705505758524, "rewards//mean": 0.791259765625, "rewards//std": 0.028055289760231972, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.234, "grad_norm": 4.047274589538574, "kl": 1.974445316940546, "learning_rate": 8.791188963461652e-07, "loss": 0.1974, "num_tokens": 10113640.0, "reward": 0.7313232421875, "reward_std": 0.011279085651040077, "rewards//mean": 0.7313232421875, "rewards//std": 0.03313444182276726, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2342, "grad_norm": 2.8111186027526855, "kl": 0.6526630073785782, "learning_rate": 8.789119261039384e-07, "loss": 0.0653, "num_tokens": 10122320.0, "reward": 0.779052734375, "reward_std": 0.005872816778719425, "rewards//mean": 0.779052734375, "rewards//std": 0.02503519132733345, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2344, "grad_norm": 1.8342902660369873, "kl": 1.5613654907792807, "learning_rate": 8.78704803236033e-07, "loss": 0.1561, "num_tokens": 10131064.0, "reward": 0.7276611328125, "reward_std": 0.0104688024148345, "rewards//mean": 0.7276611328125, "rewards//std": 0.021959304809570312, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2346, "grad_norm": 3.295234441757202, "kl": 1.537804240360856, "learning_rate": 8.784975278258782e-07, "loss": 0.1538, "num_tokens": 10139696.0, "reward": 0.726806640625, "reward_std": 0.010460578836500645, "rewards//mean": 0.726806640625, "rewards//std": 0.03383180499076843, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2348, "grad_norm": 3.2483508586883545, "kl": 0.9632603842765093, "learning_rate": 8.782900999569645e-07, "loss": 0.0963, "num_tokens": 10148320.0, "reward": 0.75445556640625, "reward_std": 0.007059913594275713, "rewards//mean": 0.75445556640625, "rewards//std": 0.029255446046590805, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.235, "grad_norm": 4.360206604003906, "kl": 1.2281594015657902, "learning_rate": 8.780825197128437e-07, "loss": 0.1228, "num_tokens": 10156992.0, "reward": 0.7880859375, "reward_std": 0.008820902556180954, "rewards//mean": 0.7880859375, "rewards//std": 0.03365325182676315, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2352, "grad_norm": 2.771652936935425, "kl": 0.9551923777908087, "learning_rate": 8.778747871771291e-07, "loss": 0.0955, "num_tokens": 10165576.0, "reward": 0.76788330078125, "reward_std": 0.00427279295399785, "rewards//mean": 0.76788330078125, "rewards//std": 0.015069565735757351, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2354, "grad_norm": 11.196528434753418, "kl": 1.2639826629310846, "learning_rate": 8.776669024334955e-07, "loss": 0.1264, "num_tokens": 10174256.0, "reward": 0.75341796875, "reward_std": 0.00726303830742836, "rewards//mean": 0.75341796875, "rewards//std": 0.02312016673386097, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2356, "grad_norm": 8.246557235717773, "kl": 1.1232012081891298, "learning_rate": 8.774588655656787e-07, "loss": 0.1123, "num_tokens": 10182920.0, "reward": 0.731201171875, "reward_std": 0.005265967454761267, "rewards//mean": 0.731201171875, "rewards//std": 0.040372833609580994, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2358, "grad_norm": 3.1767334938049316, "kl": 1.5803054478019476, "learning_rate": 8.772506766574761e-07, "loss": 0.158, "num_tokens": 10191600.0, "reward": 0.7603759765625, "reward_std": 0.012790179811418056, "rewards//mean": 0.7603759765625, "rewards//std": 0.03148334100842476, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.236, "grad_norm": 3.4187545776367188, "kl": 1.6482271905988455, "learning_rate": 8.770423357927462e-07, "loss": 0.1648, "num_tokens": 10200344.0, "reward": 0.73138427734375, "reward_std": 0.008016904816031456, "rewards//mean": 0.73138427734375, "rewards//std": 0.02673683688044548, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2362, "grad_norm": 5.574641227722168, "kl": 2.4331605937331915, "learning_rate": 8.768338430554082e-07, "loss": 0.2433, "num_tokens": 10208960.0, "reward": 0.7445068359375, "reward_std": 0.01510899793356657, "rewards//mean": 0.7445068359375, "rewards//std": 0.04163595661520958, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2364, "grad_norm": 3.7420549392700195, "kl": 2.0698746386915445, "learning_rate": 8.766251985294434e-07, "loss": 0.207, "num_tokens": 10217648.0, "reward": 0.75933837890625, "reward_std": 0.013081587851047516, "rewards//mean": 0.75933837890625, "rewards//std": 0.03291551023721695, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2366, "grad_norm": 2.152587890625, "kl": 1.5023798700422049, "learning_rate": 8.764164022988937e-07, "loss": 0.1502, "num_tokens": 10226272.0, "reward": 0.7518310546875, "reward_std": 0.009207741357386112, "rewards//mean": 0.7518310546875, "rewards//std": 0.028125077486038208, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2368, "grad_norm": 2.536895751953125, "kl": 1.8943487722426653, "learning_rate": 8.762074544478621e-07, "loss": 0.1894, "num_tokens": 10234856.0, "reward": 0.74957275390625, "reward_std": 0.011271432042121887, "rewards//mean": 0.74957275390625, "rewards//std": 0.029635872691869736, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.237, "grad_norm": 6.344274520874023, "kl": 1.4856875035911798, "learning_rate": 8.75998355060513e-07, "loss": 0.1486, "num_tokens": 10243440.0, "reward": 0.73614501953125, "reward_std": 0.008308660238981247, "rewards//mean": 0.73614501953125, "rewards//std": 0.03561776876449585, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2372, "grad_norm": 2.1593058109283447, "kl": 2.0834833960980177, "learning_rate": 8.757891042210712e-07, "loss": 0.2083, "num_tokens": 10252096.0, "reward": 0.76531982421875, "reward_std": 0.014573352411389351, "rewards//mean": 0.76531982421875, "rewards//std": 0.03263237699866295, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2374, "grad_norm": 2.0074822902679443, "kl": 1.9211790971457958, "learning_rate": 8.755797020138234e-07, "loss": 0.1921, "num_tokens": 10260736.0, "reward": 0.74188232421875, "reward_std": 0.012608667835593224, "rewards//mean": 0.74188232421875, "rewards//std": 0.029588330537080765, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2376, "grad_norm": 2.497415065765381, "kl": 1.0434564761817455, "learning_rate": 8.753701485231164e-07, "loss": 0.1043, "num_tokens": 10269320.0, "reward": 0.74847412109375, "reward_std": 0.007842149585485458, "rewards//mean": 0.74847412109375, "rewards//std": 0.034475311636924744, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2378, "grad_norm": 3.1493000984191895, "kl": 1.3220611158758402, "learning_rate": 8.751604438333586e-07, "loss": 0.1322, "num_tokens": 10278008.0, "reward": 0.766357421875, "reward_std": 0.010030495002865791, "rewards//mean": 0.766357421875, "rewards//std": 0.024733463302254677, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.238, "grad_norm": 2.0060431957244873, "kl": 1.1081401947885752, "learning_rate": 8.749505880290188e-07, "loss": 0.1108, "num_tokens": 10286632.0, "reward": 0.76080322265625, "reward_std": 0.007352760061621666, "rewards//mean": 0.76080322265625, "rewards//std": 0.0350336991250515, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2382, "grad_norm": 4.355103969573975, "kl": 1.3405583892017603, "learning_rate": 8.74740581194627e-07, "loss": 0.1341, "num_tokens": 10295192.0, "reward": 0.74468994140625, "reward_std": 0.01251955982297659, "rewards//mean": 0.74468994140625, "rewards//std": 0.025338031351566315, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2384, "grad_norm": 3.6300463676452637, "kl": 2.07859293743968, "learning_rate": 8.745304234147739e-07, "loss": 0.2079, "num_tokens": 10303872.0, "reward": 0.7431640625, "reward_std": 0.012090795673429966, "rewards//mean": 0.7431640625, "rewards//std": 0.03787250071763992, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2386, "grad_norm": 3.209120988845825, "kl": 0.951006256043911, "learning_rate": 8.743201147741111e-07, "loss": 0.0951, "num_tokens": 10312528.0, "reward": 0.76507568359375, "reward_std": 0.008098564110696316, "rewards//mean": 0.76507568359375, "rewards//std": 0.025189433246850967, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2388, "grad_norm": 2.1020562648773193, "kl": 0.973062552511692, "learning_rate": 8.741096553573506e-07, "loss": 0.0973, "num_tokens": 10321160.0, "reward": 0.7633056640625, "reward_std": 0.005382574163377285, "rewards//mean": 0.7633056640625, "rewards//std": 0.03894898667931557, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.239, "grad_norm": 1.7738032341003418, "kl": 1.6766906436532736, "learning_rate": 8.73899045249266e-07, "loss": 0.1677, "num_tokens": 10329808.0, "reward": 0.78106689453125, "reward_std": 0.01804327219724655, "rewards//mean": 0.78106689453125, "rewards//std": 0.02997160144150257, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2392, "grad_norm": 7.119793891906738, "kl": 1.8139428477734327, "learning_rate": 8.736882845346905e-07, "loss": 0.1814, "num_tokens": 10338520.0, "reward": 0.72161865234375, "reward_std": 0.004580400418490171, "rewards//mean": 0.72161865234375, "rewards//std": 0.023481298238039017, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2394, "grad_norm": 3.3124215602874756, "kl": 1.0021106284111738, "learning_rate": 8.734773732985185e-07, "loss": 0.1002, "num_tokens": 10347160.0, "reward": 0.74310302734375, "reward_std": 0.006465718150138855, "rewards//mean": 0.74310302734375, "rewards//std": 0.03422942012548447, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2396, "grad_norm": 2.3718671798706055, "kl": 1.407300479710102, "learning_rate": 8.732663116257055e-07, "loss": 0.1407, "num_tokens": 10355760.0, "reward": 0.7791748046875, "reward_std": 0.00996287353336811, "rewards//mean": 0.7791748046875, "rewards//std": 0.03363142907619476, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2398, "grad_norm": 4.85661506652832, "kl": 2.3818425964564085, "learning_rate": 8.730550996012667e-07, "loss": 0.2382, "num_tokens": 10364384.0, "reward": 0.74798583984375, "reward_std": 0.016918540000915527, "rewards//mean": 0.74798583984375, "rewards//std": 0.027125800028443336, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.24, "grad_norm": 2.8339829444885254, "kl": 1.1268452275544405, "learning_rate": 8.728437373102784e-07, "loss": 0.1127, "num_tokens": 10372912.0, "reward": 0.77685546875, "reward_std": 0.009339243173599243, "rewards//mean": 0.77685546875, "rewards//std": 0.02476893737912178, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2402, "grad_norm": 2.791949987411499, "kl": 1.2671589367091656, "learning_rate": 8.726322248378774e-07, "loss": 0.1267, "num_tokens": 10381504.0, "reward": 0.7745361328125, "reward_std": 0.00692584365606308, "rewards//mean": 0.7745361328125, "rewards//std": 0.021389398723840714, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2404, "grad_norm": 2.6466314792633057, "kl": 1.5692084152251482, "learning_rate": 8.724205622692606e-07, "loss": 0.1569, "num_tokens": 10390168.0, "reward": 0.7591552734375, "reward_std": 0.010664994828402996, "rewards//mean": 0.7591552734375, "rewards//std": 0.027902444824576378, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2406, "grad_norm": 3.629345417022705, "kl": 1.674876980483532, "learning_rate": 8.72208749689686e-07, "loss": 0.1675, "num_tokens": 10398736.0, "reward": 0.7613525390625, "reward_std": 0.012000908143818378, "rewards//mean": 0.7613525390625, "rewards//std": 0.02549385465681553, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2408, "grad_norm": 7.045103073120117, "kl": 1.3719742316752672, "learning_rate": 8.719967871844715e-07, "loss": 0.1372, "num_tokens": 10407320.0, "reward": 0.6990966796875, "reward_std": 0.007162667810916901, "rewards//mean": 0.6990966796875, "rewards//std": 0.04391361027956009, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.241, "grad_norm": 2.055586338043213, "kl": 1.0216310992836952, "learning_rate": 8.717846748389955e-07, "loss": 0.1022, "num_tokens": 10415912.0, "reward": 0.72021484375, "reward_std": 0.005646876059472561, "rewards//mean": 0.72021484375, "rewards//std": 0.03211009502410889, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2412, "grad_norm": 3.369438409805298, "kl": 1.2664784640073776, "learning_rate": 8.71572412738697e-07, "loss": 0.1266, "num_tokens": 10424592.0, "reward": 0.75750732421875, "reward_std": 0.007272562012076378, "rewards//mean": 0.75750732421875, "rewards//std": 0.028316771611571312, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2414, "grad_norm": 2.284891128540039, "kl": 1.4312961101531982, "learning_rate": 8.713600009690751e-07, "loss": 0.1431, "num_tokens": 10433208.0, "reward": 0.75244140625, "reward_std": 0.008055581711232662, "rewards//mean": 0.75244140625, "rewards//std": 0.036594267934560776, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2416, "grad_norm": 6.646371841430664, "kl": 1.6266127917915583, "learning_rate": 8.711474396156892e-07, "loss": 0.1627, "num_tokens": 10441728.0, "reward": 0.74932861328125, "reward_std": 0.014531968161463737, "rewards//mean": 0.74932861328125, "rewards//std": 0.030989699065685272, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2418, "grad_norm": 2.9526915550231934, "kl": 1.5980976950377226, "learning_rate": 8.709347287641592e-07, "loss": 0.1598, "num_tokens": 10450336.0, "reward": 0.75726318359375, "reward_std": 0.012622412294149399, "rewards//mean": 0.75726318359375, "rewards//std": 0.033803097903728485, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.242, "grad_norm": 4.072620868682861, "kl": 1.4195548593997955, "learning_rate": 8.707218685001646e-07, "loss": 0.142, "num_tokens": 10458856.0, "reward": 0.7459716796875, "reward_std": 0.008270646445453167, "rewards//mean": 0.7459716796875, "rewards//std": 0.03224910423159599, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2422, "grad_norm": 0.8886381387710571, "kl": 0.87002008035779, "learning_rate": 8.705088589094458e-07, "loss": 0.087, "num_tokens": 10467424.0, "reward": 0.72772216796875, "reward_std": 0.004259682726114988, "rewards//mean": 0.72772216796875, "rewards//std": 0.033208996057510376, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2424, "grad_norm": 3.159609317779541, "kl": 1.124261612072587, "learning_rate": 8.702957000778029e-07, "loss": 0.1124, "num_tokens": 10476160.0, "reward": 0.773681640625, "reward_std": 0.007289689499884844, "rewards//mean": 0.773681640625, "rewards//std": 0.02921408787369728, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2426, "grad_norm": 3.1278998851776123, "kl": 1.4958477690815926, "learning_rate": 8.700823920910963e-07, "loss": 0.1496, "num_tokens": 10484800.0, "reward": 0.74505615234375, "reward_std": 0.012510336935520172, "rewards//mean": 0.74505615234375, "rewards//std": 0.028900403529405594, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2428, "grad_norm": 2.582415819168091, "kl": 1.4311014134436846, "learning_rate": 8.698689350352464e-07, "loss": 0.1431, "num_tokens": 10493448.0, "reward": 0.7889404296875, "reward_std": 0.007610922213643789, "rewards//mean": 0.7889404296875, "rewards//std": 0.018649086356163025, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.243, "grad_norm": 3.446004629135132, "kl": 1.5160708278417587, "learning_rate": 8.696553289962337e-07, "loss": 0.1516, "num_tokens": 10502088.0, "reward": 0.77960205078125, "reward_std": 0.010885559022426605, "rewards//mean": 0.77960205078125, "rewards//std": 0.021696729585528374, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2432, "grad_norm": 2.9214131832122803, "kl": 1.4365063477307558, "learning_rate": 8.694415740600988e-07, "loss": 0.1437, "num_tokens": 10510608.0, "reward": 0.7723388671875, "reward_std": 0.009778087958693504, "rewards//mean": 0.7723388671875, "rewards//std": 0.02492459863424301, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2434, "grad_norm": 1.2435612678527832, "kl": 0.8110390044748783, "learning_rate": 8.69227670312942e-07, "loss": 0.0811, "num_tokens": 10519232.0, "reward": 0.7650146484375, "reward_std": 0.005041959695518017, "rewards//mean": 0.7650146484375, "rewards//std": 0.020870720967650414, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2436, "grad_norm": 0.4521288275718689, "kl": 0.43685695715248585, "learning_rate": 8.690136178409235e-07, "loss": 0.0437, "num_tokens": 10527872.0, "reward": 0.767578125, "reward_std": 0.0016451351111754775, "rewards//mean": 0.767578125, "rewards//std": 0.02483241632580757, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2438, "grad_norm": 1.9898988008499146, "kl": 1.0086859688162804, "learning_rate": 8.687994167302641e-07, "loss": 0.1009, "num_tokens": 10536560.0, "reward": 0.71978759765625, "reward_std": 0.004595073405653238, "rewards//mean": 0.71978759765625, "rewards//std": 0.024401891976594925, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.244, "grad_norm": 3.703798294067383, "kl": 0.8747746162116528, "learning_rate": 8.685850670672438e-07, "loss": 0.0875, "num_tokens": 10545168.0, "reward": 0.7659912109375, "reward_std": 0.008853060193359852, "rewards//mean": 0.7659912109375, "rewards//std": 0.030550191178917885, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2442, "grad_norm": 2.477851152420044, "kl": 1.1440365593880415, "learning_rate": 8.683705689382024e-07, "loss": 0.1144, "num_tokens": 10553912.0, "reward": 0.73016357421875, "reward_std": 0.00880036223679781, "rewards//mean": 0.73016357421875, "rewards//std": 0.017473505809903145, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2444, "grad_norm": 4.674661159515381, "kl": 0.7348966244608164, "learning_rate": 8.6815592242954e-07, "loss": 0.0735, "num_tokens": 10562520.0, "reward": 0.72906494140625, "reward_std": 0.010388755239546299, "rewards//mean": 0.72906494140625, "rewards//std": 0.04088842123746872, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2446, "grad_norm": 5.477402210235596, "kl": 0.835672477260232, "learning_rate": 8.67941127627716e-07, "loss": 0.0836, "num_tokens": 10571288.0, "reward": 0.78704833984375, "reward_std": 0.011899751611053944, "rewards//mean": 0.78704833984375, "rewards//std": 0.027874423190951347, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2448, "grad_norm": 6.605255603790283, "kl": 0.9427262730896473, "learning_rate": 8.677261846192499e-07, "loss": 0.0943, "num_tokens": 10579888.0, "reward": 0.7578125, "reward_std": 0.009779705666005611, "rewards//mean": 0.7578125, "rewards//std": 0.027909226715564728, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.245, "grad_norm": 2.804797887802124, "kl": 1.780638962984085, "learning_rate": 8.675110934907204e-07, "loss": 0.1781, "num_tokens": 10588520.0, "reward": 0.77777099609375, "reward_std": 0.010743267834186554, "rewards//mean": 0.77777099609375, "rewards//std": 0.024823803454637527, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2452, "grad_norm": 1.7409158945083618, "kl": 1.267751183360815, "learning_rate": 8.672958543287666e-07, "loss": 0.1268, "num_tokens": 10597080.0, "reward": 0.77142333984375, "reward_std": 0.010439383797347546, "rewards//mean": 0.77142333984375, "rewards//std": 0.02546793781220913, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2454, "grad_norm": 1.2458970546722412, "kl": 1.0435888636857271, "learning_rate": 8.670804672200865e-07, "loss": 0.1044, "num_tokens": 10605704.0, "reward": 0.7398681640625, "reward_std": 0.005552348215132952, "rewards//mean": 0.7398681640625, "rewards//std": 0.025635965168476105, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2456, "grad_norm": 3.520935297012329, "kl": 0.8077416438609362, "learning_rate": 8.668649322514381e-07, "loss": 0.0808, "num_tokens": 10614352.0, "reward": 0.73052978515625, "reward_std": 0.006767407990992069, "rewards//mean": 0.73052978515625, "rewards//std": 0.02818979136645794, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2458, "grad_norm": 4.556258201599121, "kl": 1.0413630101829767, "learning_rate": 8.666492495096389e-07, "loss": 0.1041, "num_tokens": 10622968.0, "reward": 0.7535400390625, "reward_std": 0.00897935964167118, "rewards//mean": 0.7535400390625, "rewards//std": 0.03483942151069641, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.246, "grad_norm": 2.738713026046753, "kl": 1.480254141613841, "learning_rate": 8.664334190815659e-07, "loss": 0.148, "num_tokens": 10631624.0, "reward": 0.7525634765625, "reward_std": 0.010982338339090347, "rewards//mean": 0.7525634765625, "rewards//std": 0.028273237869143486, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2462, "grad_norm": 5.985865592956543, "kl": 2.037445917725563, "learning_rate": 8.662174410541554e-07, "loss": 0.2037, "num_tokens": 10640224.0, "reward": 0.7392578125, "reward_std": 0.012627032585442066, "rewards//mean": 0.7392578125, "rewards//std": 0.03356677293777466, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2464, "grad_norm": 3.7179629802703857, "kl": 1.844133771955967, "learning_rate": 8.660013155144035e-07, "loss": 0.1844, "num_tokens": 10648752.0, "reward": 0.72821044921875, "reward_std": 0.016244065016508102, "rewards//mean": 0.72821044921875, "rewards//std": 0.03532329574227333, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2466, "grad_norm": 4.169260501861572, "kl": 1.7794784530997276, "learning_rate": 8.657850425493654e-07, "loss": 0.1779, "num_tokens": 10657368.0, "reward": 0.746337890625, "reward_std": 0.01339347381144762, "rewards//mean": 0.746337890625, "rewards//std": 0.039841409772634506, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2468, "grad_norm": 3.695326328277588, "kl": 1.2684337049722672, "learning_rate": 8.65568622246156e-07, "loss": 0.1268, "num_tokens": 10666032.0, "reward": 0.762451171875, "reward_std": 0.008404484018683434, "rewards//mean": 0.762451171875, "rewards//std": 0.027285005897283554, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.247, "grad_norm": 2.6150856018066406, "kl": 0.7142418771982193, "learning_rate": 8.653520546919493e-07, "loss": 0.0714, "num_tokens": 10674592.0, "reward": 0.786865234375, "reward_std": 0.006299816071987152, "rewards//mean": 0.786865234375, "rewards//std": 0.03240573778748512, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2472, "grad_norm": 2.7588894367218018, "kl": 1.3413833174854517, "learning_rate": 8.651353399739787e-07, "loss": 0.1341, "num_tokens": 10683232.0, "reward": 0.74127197265625, "reward_std": 0.010000904090702534, "rewards//mean": 0.74127197265625, "rewards//std": 0.03273935988545418, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2474, "grad_norm": 6.649838447570801, "kl": 2.623996540904045, "learning_rate": 8.649184781795367e-07, "loss": 0.2624, "num_tokens": 10691880.0, "reward": 0.78399658203125, "reward_std": 0.021707233041524887, "rewards//mean": 0.78399658203125, "rewards//std": 0.03591403737664223, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2476, "grad_norm": 4.778037071228027, "kl": 1.628692101687193, "learning_rate": 8.647014693959753e-07, "loss": 0.1629, "num_tokens": 10700504.0, "reward": 0.725830078125, "reward_std": 0.01012200117111206, "rewards//mean": 0.725830078125, "rewards//std": 0.036410968750715256, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2478, "grad_norm": 3.7553439140319824, "kl": 1.1641744170337915, "learning_rate": 8.644843137107057e-07, "loss": 0.1164, "num_tokens": 10709176.0, "reward": 0.77886962890625, "reward_std": 0.007269697263836861, "rewards//mean": 0.77886962890625, "rewards//std": 0.023964976891875267, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.248, "grad_norm": 1.3512905836105347, "kl": 1.2719145566225052, "learning_rate": 8.642670112111981e-07, "loss": 0.1272, "num_tokens": 10717912.0, "reward": 0.76165771484375, "reward_std": 0.008655503392219543, "rewards//mean": 0.76165771484375, "rewards//std": 0.03598688170313835, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2482, "grad_norm": 1.5786871910095215, "kl": 0.9103036411106586, "learning_rate": 8.64049561984982e-07, "loss": 0.091, "num_tokens": 10726616.0, "reward": 0.75714111328125, "reward_std": 0.004460044205188751, "rewards//mean": 0.75714111328125, "rewards//std": 0.020190447568893433, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2484, "grad_norm": 2.169869899749756, "kl": 1.040619820356369, "learning_rate": 8.638319661196459e-07, "loss": 0.1041, "num_tokens": 10735200.0, "reward": 0.76641845703125, "reward_std": 0.007590742781758308, "rewards//mean": 0.76641845703125, "rewards//std": 0.02504781074821949, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2486, "grad_norm": 10.328278541564941, "kl": 2.630313467234373, "learning_rate": 8.636142237028372e-07, "loss": 0.263, "num_tokens": 10743880.0, "reward": 0.73651123046875, "reward_std": 0.01347922533750534, "rewards//mean": 0.73651123046875, "rewards//std": 0.03510405868291855, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2488, "grad_norm": 7.640003681182861, "kl": 2.4689796324819326, "learning_rate": 8.633963348222628e-07, "loss": 0.2469, "num_tokens": 10752560.0, "reward": 0.74615478515625, "reward_std": 0.010604561306536198, "rewards//mean": 0.74615478515625, "rewards//std": 0.036508072167634964, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.249, "grad_norm": 4.208217620849609, "kl": 1.5706639103591442, "learning_rate": 8.631782995656882e-07, "loss": 0.1571, "num_tokens": 10761192.0, "reward": 0.75830078125, "reward_std": 0.0143938809633255, "rewards//mean": 0.75830078125, "rewards//std": 0.03016548976302147, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2492, "grad_norm": 2.6180520057678223, "kl": 1.5482365787029266, "learning_rate": 8.62960118020938e-07, "loss": 0.1548, "num_tokens": 10769904.0, "reward": 0.71759033203125, "reward_std": 0.010162541642785072, "rewards//mean": 0.71759033203125, "rewards//std": 0.04660388082265854, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2494, "grad_norm": 2.4136457443237305, "kl": 1.4631451219320297, "learning_rate": 8.627417902758956e-07, "loss": 0.1463, "num_tokens": 10778632.0, "reward": 0.7861328125, "reward_std": 0.014890835620462894, "rewards//mean": 0.7861328125, "rewards//std": 0.02900712378323078, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2496, "grad_norm": 1.8261510133743286, "kl": 1.0705778319388628, "learning_rate": 8.625233164185034e-07, "loss": 0.1071, "num_tokens": 10787216.0, "reward": 0.751708984375, "reward_std": 0.005420691333711147, "rewards//mean": 0.751708984375, "rewards//std": 0.02442800998687744, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2498, "grad_norm": 1.898107647895813, "kl": 0.8983949609100819, "learning_rate": 8.623046965367628e-07, "loss": 0.0898, "num_tokens": 10795792.0, "reward": 0.7265625, "reward_std": 0.003153369063511491, "rewards//mean": 0.7265625, "rewards//std": 0.021156350150704384, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.25, "grad_norm": 3.469754934310913, "kl": 2.335224675014615, "learning_rate": 8.620859307187338e-07, "loss": 0.2335, "num_tokens": 10804400.0, "reward": 0.74237060546875, "reward_std": 0.012575688771903515, "rewards//mean": 0.74237060546875, "rewards//std": 0.02802553027868271, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2502, "grad_norm": 3.4656877517700195, "kl": 1.8427002094686031, "learning_rate": 8.61867019052535e-07, "loss": 0.1843, "num_tokens": 10813208.0, "reward": 0.75823974609375, "reward_std": 0.011305361986160278, "rewards//mean": 0.75823974609375, "rewards//std": 0.03339400514960289, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2504, "grad_norm": 2.106476068496704, "kl": 2.1087175458669662, "learning_rate": 8.616479616263444e-07, "loss": 0.2109, "num_tokens": 10821848.0, "reward": 0.75262451171875, "reward_std": 0.013109234161674976, "rewards//mean": 0.75262451171875, "rewards//std": 0.03394743800163269, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2506, "grad_norm": 2.731506586074829, "kl": 0.6706829220056534, "learning_rate": 8.61428758528398e-07, "loss": 0.0671, "num_tokens": 10830376.0, "reward": 0.7547607421875, "reward_std": 0.004223778378218412, "rewards//mean": 0.7547607421875, "rewards//std": 0.01751730777323246, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2508, "grad_norm": 2.818578004837036, "kl": 1.552479200065136, "learning_rate": 8.612094098469909e-07, "loss": 0.1552, "num_tokens": 10838992.0, "reward": 0.77081298828125, "reward_std": 0.013558020815253258, "rewards//mean": 0.77081298828125, "rewards//std": 0.029463233426213264, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.251, "grad_norm": 6.754956245422363, "kl": 1.914738615974784, "learning_rate": 8.609899156704767e-07, "loss": 0.1915, "num_tokens": 10847728.0, "reward": 0.7509765625, "reward_std": 0.011311469599604607, "rewards//mean": 0.7509765625, "rewards//std": 0.03421001136302948, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2512, "grad_norm": 5.389358043670654, "kl": 2.6926444843411446, "learning_rate": 8.607702760872677e-07, "loss": 0.2693, "num_tokens": 10856544.0, "reward": 0.776611328125, "reward_std": 0.01727277785539627, "rewards//mean": 0.776611328125, "rewards//std": 0.05261942744255066, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2514, "grad_norm": 2.313718557357788, "kl": 1.280439605936408, "learning_rate": 8.605504911858346e-07, "loss": 0.128, "num_tokens": 10865104.0, "reward": 0.7525634765625, "reward_std": 0.006848743185400963, "rewards//mean": 0.7525634765625, "rewards//std": 0.02454022504389286, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2516, "grad_norm": 1.7476269006729126, "kl": 1.4869457203894854, "learning_rate": 8.603305610547069e-07, "loss": 0.1487, "num_tokens": 10873816.0, "reward": 0.7633056640625, "reward_std": 0.010295258834958076, "rewards//mean": 0.7633056640625, "rewards//std": 0.04068861901760101, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2518, "grad_norm": 2.303586721420288, "kl": 1.176751121878624, "learning_rate": 8.601104857824722e-07, "loss": 0.1177, "num_tokens": 10882440.0, "reward": 0.79437255859375, "reward_std": 0.006195859983563423, "rewards//mean": 0.79437255859375, "rewards//std": 0.030515672639012337, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.252, "grad_norm": 2.7836127281188965, "kl": 1.0767667312175035, "learning_rate": 8.598902654577768e-07, "loss": 0.1077, "num_tokens": 10891048.0, "reward": 0.7734375, "reward_std": 0.01202109083533287, "rewards//mean": 0.7734375, "rewards//std": 0.0266120582818985, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2522, "grad_norm": 3.5508615970611572, "kl": 1.8393850270658731, "learning_rate": 8.596699001693255e-07, "loss": 0.1839, "num_tokens": 10899720.0, "reward": 0.750244140625, "reward_std": 0.015253997407853603, "rewards//mean": 0.750244140625, "rewards//std": 0.034610338509082794, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2524, "grad_norm": 5.652132034301758, "kl": 1.8502464350312948, "learning_rate": 8.594493900058816e-07, "loss": 0.185, "num_tokens": 10908304.0, "reward": 0.766845703125, "reward_std": 0.01523201446980238, "rewards//mean": 0.766845703125, "rewards//std": 0.052180320024490356, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2526, "grad_norm": 3.7884795665740967, "kl": 1.8132641948759556, "learning_rate": 8.592287350562663e-07, "loss": 0.1813, "num_tokens": 10916880.0, "reward": 0.782958984375, "reward_std": 0.010780639946460724, "rewards//mean": 0.782958984375, "rewards//std": 0.033326905220746994, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2528, "grad_norm": 3.681049346923828, "kl": 0.9148434549570084, "learning_rate": 8.590079354093593e-07, "loss": 0.0915, "num_tokens": 10925520.0, "reward": 0.76666259765625, "reward_std": 0.0037816944532096386, "rewards//mean": 0.76666259765625, "rewards//std": 0.019981687888503075, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.253, "grad_norm": 0.6351627707481384, "kl": 0.7859466709196568, "learning_rate": 8.587869911540992e-07, "loss": 0.0786, "num_tokens": 10934168.0, "reward": 0.75091552734375, "reward_std": 0.00666106166318059, "rewards//mean": 0.75091552734375, "rewards//std": 0.02945244126021862, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2532, "grad_norm": 1.621062159538269, "kl": 2.01824108697474, "learning_rate": 8.585659023794818e-07, "loss": 0.2018, "num_tokens": 10942832.0, "reward": 0.745361328125, "reward_std": 0.016542179509997368, "rewards//mean": 0.745361328125, "rewards//std": 0.036939289420843124, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2534, "grad_norm": 5.1790771484375, "kl": 0.8341042678803205, "learning_rate": 8.583446691745617e-07, "loss": 0.0834, "num_tokens": 10951568.0, "reward": 0.7872314453125, "reward_std": 0.008905846625566483, "rewards//mean": 0.7872314453125, "rewards//std": 0.020646117627620697, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2536, "grad_norm": 2.450307607650757, "kl": 0.867433724924922, "learning_rate": 8.581232916284517e-07, "loss": 0.0867, "num_tokens": 10960248.0, "reward": 0.75677490234375, "reward_std": 0.0065104844979941845, "rewards//mean": 0.75677490234375, "rewards//std": 0.02819623425602913, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2538, "grad_norm": 2.024763584136963, "kl": 1.301382478326559, "learning_rate": 8.579017698303228e-07, "loss": 0.1301, "num_tokens": 10968904.0, "reward": 0.75848388671875, "reward_std": 0.008169629611074924, "rewards//mean": 0.75848388671875, "rewards//std": 0.03370039165019989, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.254, "grad_norm": 4.69452428817749, "kl": 1.8262019213289022, "learning_rate": 8.576801038694039e-07, "loss": 0.1826, "num_tokens": 10977560.0, "reward": 0.76617431640625, "reward_std": 0.013292655348777771, "rewards//mean": 0.76617431640625, "rewards//std": 0.03792916610836983, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2542, "grad_norm": 1.103826642036438, "kl": 0.6330838054418564, "learning_rate": 8.574582938349817e-07, "loss": 0.0633, "num_tokens": 10986264.0, "reward": 0.767578125, "reward_std": 0.0022538788616657257, "rewards//mean": 0.767578125, "rewards//std": 0.016977090388536453, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2544, "grad_norm": 1.8948676586151123, "kl": 0.7893429528921843, "learning_rate": 8.572363398164016e-07, "loss": 0.0789, "num_tokens": 10994872.0, "reward": 0.76055908203125, "reward_std": 0.003600158728659153, "rewards//mean": 0.76055908203125, "rewards//std": 0.023915022611618042, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2546, "grad_norm": 4.184702396392822, "kl": 1.00759750418365, "learning_rate": 8.570142419030666e-07, "loss": 0.1008, "num_tokens": 11003560.0, "reward": 0.76800537109375, "reward_std": 0.008670274168252945, "rewards//mean": 0.76800537109375, "rewards//std": 0.030358511954545975, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2548, "grad_norm": 2.8754255771636963, "kl": 1.1997501738369465, "learning_rate": 8.567920001844375e-07, "loss": 0.12, "num_tokens": 11012160.0, "reward": 0.7510986328125, "reward_std": 0.010766448453068733, "rewards//mean": 0.7510986328125, "rewards//std": 0.031791478395462036, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.255, "grad_norm": 9.786314964294434, "kl": 2.5837177895009518, "learning_rate": 8.565696147500337e-07, "loss": 0.2584, "num_tokens": 11020816.0, "reward": 0.74859619140625, "reward_std": 0.017663855105638504, "rewards//mean": 0.74859619140625, "rewards//std": 0.041773296892642975, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2552, "grad_norm": 2.8679399490356445, "kl": 1.023074833676219, "learning_rate": 8.563470856894314e-07, "loss": 0.1023, "num_tokens": 11029440.0, "reward": 0.76708984375, "reward_std": 0.008393588475883007, "rewards//mean": 0.76708984375, "rewards//std": 0.01991196535527706, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2554, "grad_norm": 4.652540683746338, "kl": 0.9408929571509361, "learning_rate": 8.561244130922657e-07, "loss": 0.0941, "num_tokens": 11038104.0, "reward": 0.7435302734375, "reward_std": 0.004335008095949888, "rewards//mean": 0.7435302734375, "rewards//std": 0.03208533674478531, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2556, "grad_norm": 6.747488975524902, "kl": 2.049085784703493, "learning_rate": 8.559015970482291e-07, "loss": 0.2049, "num_tokens": 11046760.0, "reward": 0.74151611328125, "reward_std": 0.010567471385002136, "rewards//mean": 0.74151611328125, "rewards//std": 0.044893182814121246, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2558, "grad_norm": 3.9553515911102295, "kl": 1.5856800060719252, "learning_rate": 8.556786376470716e-07, "loss": 0.1586, "num_tokens": 11055496.0, "reward": 0.77276611328125, "reward_std": 0.0153457997366786, "rewards//mean": 0.77276611328125, "rewards//std": 0.03472686558961868, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.256, "grad_norm": 3.669755220413208, "kl": 1.9961835369467735, "learning_rate": 8.554555349786015e-07, "loss": 0.1996, "num_tokens": 11064112.0, "reward": 0.79107666015625, "reward_std": 0.01941429078578949, "rewards//mean": 0.79107666015625, "rewards//std": 0.034209955483675, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2562, "grad_norm": 3.585503101348877, "kl": 1.9079175908118486, "learning_rate": 8.552322891326844e-07, "loss": 0.1908, "num_tokens": 11072752.0, "reward": 0.76318359375, "reward_std": 0.011368101462721825, "rewards//mean": 0.76318359375, "rewards//std": 0.017723919823765755, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2564, "grad_norm": 3.162195920944214, "kl": 1.5276030581444502, "learning_rate": 8.550089001992437e-07, "loss": 0.1528, "num_tokens": 11081424.0, "reward": 0.74066162109375, "reward_std": 0.010886474512517452, "rewards//mean": 0.74066162109375, "rewards//std": 0.041826896369457245, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2566, "grad_norm": 4.8460798263549805, "kl": 1.4931025095283985, "learning_rate": 8.547853682682604e-07, "loss": 0.1493, "num_tokens": 11089952.0, "reward": 0.77691650390625, "reward_std": 0.010478938929736614, "rewards//mean": 0.77691650390625, "rewards//std": 0.024060796946287155, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2568, "grad_norm": 1.2930303812026978, "kl": 0.787976648658514, "learning_rate": 8.545616934297733e-07, "loss": 0.0788, "num_tokens": 11098544.0, "reward": 0.79632568359375, "reward_std": 0.004147297702729702, "rewards//mean": 0.79632568359375, "rewards//std": 0.02091301791369915, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.257, "grad_norm": 1.154871940612793, "kl": 1.2250561993569136, "learning_rate": 8.543378757738784e-07, "loss": 0.1225, "num_tokens": 11107168.0, "reward": 0.7586669921875, "reward_std": 0.006589522585272789, "rewards//mean": 0.7586669921875, "rewards//std": 0.039533793926239014, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2572, "grad_norm": 1.7693229913711548, "kl": 1.31552180275321, "learning_rate": 8.541139153907295e-07, "loss": 0.1316, "num_tokens": 11115752.0, "reward": 0.75714111328125, "reward_std": 0.007101314142346382, "rewards//mean": 0.75714111328125, "rewards//std": 0.022418692708015442, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2574, "grad_norm": 2.711568593978882, "kl": 1.3092481419444084, "learning_rate": 8.538898123705379e-07, "loss": 0.1309, "num_tokens": 11124360.0, "reward": 0.7662353515625, "reward_std": 0.009426334872841835, "rewards//mean": 0.7662353515625, "rewards//std": 0.03132136911153793, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2576, "grad_norm": 1.8225528001785278, "kl": 1.1906631663441658, "learning_rate": 8.536655668035721e-07, "loss": 0.1191, "num_tokens": 11133016.0, "reward": 0.76165771484375, "reward_std": 0.0082542160525918, "rewards//mean": 0.76165771484375, "rewards//std": 0.035791173577308655, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2578, "grad_norm": 3.3898680210113525, "kl": 1.274760453030467, "learning_rate": 8.534411787801586e-07, "loss": 0.1275, "num_tokens": 11141656.0, "reward": 0.7720947265625, "reward_std": 0.008762829937040806, "rewards//mean": 0.7720947265625, "rewards//std": 0.03438809514045715, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.258, "grad_norm": 5.22819709777832, "kl": 1.657284589484334, "learning_rate": 8.532166483906802e-07, "loss": 0.1657, "num_tokens": 11150280.0, "reward": 0.741455078125, "reward_std": 0.009275168180465698, "rewards//mean": 0.741455078125, "rewards//std": 0.03468024730682373, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2582, "grad_norm": 7.5085835456848145, "kl": 2.815685471519828, "learning_rate": 8.529919757255781e-07, "loss": 0.2816, "num_tokens": 11158864.0, "reward": 0.7469482421875, "reward_std": 0.017527610063552856, "rewards//mean": 0.7469482421875, "rewards//std": 0.042543720453977585, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2584, "grad_norm": 2.104673385620117, "kl": 1.5815368201583624, "learning_rate": 8.527671608753506e-07, "loss": 0.1582, "num_tokens": 11167488.0, "reward": 0.76043701171875, "reward_std": 0.00821172446012497, "rewards//mean": 0.76043701171875, "rewards//std": 0.028485199436545372, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2586, "grad_norm": 4.024179458618164, "kl": 1.4441726431250572, "learning_rate": 8.525422039305528e-07, "loss": 0.1444, "num_tokens": 11176208.0, "reward": 0.7763671875, "reward_std": 0.015655819326639175, "rewards//mean": 0.7763671875, "rewards//std": 0.03253347799181938, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2588, "grad_norm": 2.9449851512908936, "kl": 1.3701546844094992, "learning_rate": 8.523171049817973e-07, "loss": 0.137, "num_tokens": 11184800.0, "reward": 0.77728271484375, "reward_std": 0.015028866939246655, "rewards//mean": 0.77728271484375, "rewards//std": 0.03298075124621391, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.259, "grad_norm": 2.749448776245117, "kl": 0.8904798720031977, "learning_rate": 8.520918641197541e-07, "loss": 0.089, "num_tokens": 11193456.0, "reward": 0.7777099609375, "reward_std": 0.006039226893335581, "rewards//mean": 0.7777099609375, "rewards//std": 0.0273013673722744, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2592, "grad_norm": 2.463488817214966, "kl": 0.6949389223009348, "learning_rate": 8.518664814351502e-07, "loss": 0.0695, "num_tokens": 11202144.0, "reward": 0.7506103515625, "reward_std": 0.004664257634431124, "rewards//mean": 0.7506103515625, "rewards//std": 0.028058256953954697, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2594, "grad_norm": 3.9094603061676025, "kl": 1.8501922711730003, "learning_rate": 8.516409570187696e-07, "loss": 0.185, "num_tokens": 11210904.0, "reward": 0.75457763671875, "reward_std": 0.01569659821689129, "rewards//mean": 0.75457763671875, "rewards//std": 0.03982730209827423, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2596, "grad_norm": 4.6628098487854, "kl": 1.4436557963490486, "learning_rate": 8.514152909614535e-07, "loss": 0.1444, "num_tokens": 11219616.0, "reward": 0.75396728515625, "reward_std": 0.01311071589589119, "rewards//mean": 0.75396728515625, "rewards//std": 0.029793795198202133, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2598, "grad_norm": 3.954066514968872, "kl": 1.4453269317746162, "learning_rate": 8.511894833541005e-07, "loss": 0.1445, "num_tokens": 11228280.0, "reward": 0.74578857421875, "reward_std": 0.007162814028561115, "rewards//mean": 0.74578857421875, "rewards//std": 0.0248158760368824, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.26, "grad_norm": 1.8072354793548584, "kl": 0.7664232403039932, "learning_rate": 8.509635342876654e-07, "loss": 0.0766, "num_tokens": 11236848.0, "reward": 0.75201416015625, "reward_std": 0.0031141305807977915, "rewards//mean": 0.75201416015625, "rewards//std": 0.02588300220668316, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2602, "grad_norm": 5.837655544281006, "kl": 2.054176390171051, "learning_rate": 8.507374438531606e-07, "loss": 0.2054, "num_tokens": 11245440.0, "reward": 0.75836181640625, "reward_std": 0.01429401058703661, "rewards//mean": 0.75836181640625, "rewards//std": 0.04007170721888542, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2604, "grad_norm": 9.02835464477539, "kl": 1.941506065428257, "learning_rate": 8.505112121416553e-07, "loss": 0.1942, "num_tokens": 11254128.0, "reward": 0.73370361328125, "reward_std": 0.010103583335876465, "rewards//mean": 0.73370361328125, "rewards//std": 0.024847574532032013, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2606, "grad_norm": 3.456326961517334, "kl": 1.1359886340796947, "learning_rate": 8.502848392442758e-07, "loss": 0.1136, "num_tokens": 11262768.0, "reward": 0.78424072265625, "reward_std": 0.014487136155366898, "rewards//mean": 0.78424072265625, "rewards//std": 0.03456038609147072, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2608, "grad_norm": 0.8553781509399414, "kl": 0.812415087595582, "learning_rate": 8.500583252522052e-07, "loss": 0.0812, "num_tokens": 11271360.0, "reward": 0.76873779296875, "reward_std": 0.0066340104676783085, "rewards//mean": 0.76873779296875, "rewards//std": 0.03270559012889862, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.261, "grad_norm": 2.3737709522247314, "kl": 1.4914542753249407, "learning_rate": 8.498316702566826e-07, "loss": 0.1491, "num_tokens": 11279992.0, "reward": 0.76104736328125, "reward_std": 0.010477501899003983, "rewards//mean": 0.76104736328125, "rewards//std": 0.020417097955942154, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2612, "grad_norm": 10.065834045410156, "kl": 1.307991225272417, "learning_rate": 8.496048743490053e-07, "loss": 0.1308, "num_tokens": 11288736.0, "reward": 0.76806640625, "reward_std": 0.00657601747661829, "rewards//mean": 0.76806640625, "rewards//std": 0.02585030160844326, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2614, "grad_norm": 2.43906307220459, "kl": 0.7377323266118765, "learning_rate": 8.493779376205264e-07, "loss": 0.0738, "num_tokens": 11297368.0, "reward": 0.7821044921875, "reward_std": 0.006939433049410582, "rewards//mean": 0.7821044921875, "rewards//std": 0.027930641546845436, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2616, "grad_norm": 2.145319938659668, "kl": 1.0703008230775595, "learning_rate": 8.491508601626561e-07, "loss": 0.107, "num_tokens": 11306032.0, "reward": 0.7578125, "reward_std": 0.006287199445068836, "rewards//mean": 0.7578125, "rewards//std": 0.03166576102375984, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2618, "grad_norm": 11.946181297302246, "kl": 2.5898921005427837, "learning_rate": 8.489236420668608e-07, "loss": 0.259, "num_tokens": 11314848.0, "reward": 0.74591064453125, "reward_std": 0.012224599719047546, "rewards//mean": 0.74591064453125, "rewards//std": 0.031074577942490578, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.262, "grad_norm": 1.9521502256393433, "kl": 1.122292123734951, "learning_rate": 8.486962834246645e-07, "loss": 0.1122, "num_tokens": 11323456.0, "reward": 0.73980712890625, "reward_std": 0.0033526255283504725, "rewards//mean": 0.73980712890625, "rewards//std": 0.028353635221719742, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2622, "grad_norm": 4.476296424865723, "kl": 1.1261905822902918, "learning_rate": 8.484687843276468e-07, "loss": 0.1126, "num_tokens": 11332048.0, "reward": 0.75634765625, "reward_std": 0.004982992075383663, "rewards//mean": 0.75634765625, "rewards//std": 0.030012547969818115, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2624, "grad_norm": 2.3960392475128174, "kl": 1.8680939860641956, "learning_rate": 8.482411448674445e-07, "loss": 0.1868, "num_tokens": 11340712.0, "reward": 0.7462158203125, "reward_std": 0.010261274874210358, "rewards//mean": 0.7462158203125, "rewards//std": 0.03507671877741814, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2626, "grad_norm": 6.032341480255127, "kl": 1.5083320494741201, "learning_rate": 8.480133651357505e-07, "loss": 0.1508, "num_tokens": 11349392.0, "reward": 0.74969482421875, "reward_std": 0.007270030677318573, "rewards//mean": 0.74969482421875, "rewards//std": 0.030459566041827202, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2628, "grad_norm": 2.41402268409729, "kl": 2.0536234378814697, "learning_rate": 8.477854452243147e-07, "loss": 0.2054, "num_tokens": 11358080.0, "reward": 0.78143310546875, "reward_std": 0.01560906134545803, "rewards//mean": 0.78143310546875, "rewards//std": 0.030140826478600502, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.263, "grad_norm": 1.914323091506958, "kl": 1.3961933478713036, "learning_rate": 8.475573852249434e-07, "loss": 0.1396, "num_tokens": 11366696.0, "reward": 0.74169921875, "reward_std": 0.006042609456926584, "rewards//mean": 0.74169921875, "rewards//std": 0.030221641063690186, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2632, "grad_norm": 3.4490444660186768, "kl": 1.9582337886095047, "learning_rate": 8.473291852294986e-07, "loss": 0.1958, "num_tokens": 11375288.0, "reward": 0.7637939453125, "reward_std": 0.01188709493726492, "rewards//mean": 0.7637939453125, "rewards//std": 0.024596910923719406, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2634, "grad_norm": 3.589838981628418, "kl": 1.6921525243669748, "learning_rate": 8.471008453298996e-07, "loss": 0.1692, "num_tokens": 11383904.0, "reward": 0.775390625, "reward_std": 0.0128225889056921, "rewards//mean": 0.775390625, "rewards//std": 0.03313101455569267, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2636, "grad_norm": 2.3534553050994873, "kl": 1.923955136910081, "learning_rate": 8.468723656181218e-07, "loss": 0.1924, "num_tokens": 11392448.0, "reward": 0.75640869140625, "reward_std": 0.012626361101865768, "rewards//mean": 0.75640869140625, "rewards//std": 0.02944113127887249, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2638, "grad_norm": 9.595563888549805, "kl": 1.5738086681813002, "learning_rate": 8.466437461861964e-07, "loss": 0.1574, "num_tokens": 11401088.0, "reward": 0.773193359375, "reward_std": 0.01257226150482893, "rewards//mean": 0.773193359375, "rewards//std": 0.02903946116566658, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.264, "grad_norm": 7.869680404663086, "kl": 1.9713663961738348, "learning_rate": 8.464149871262116e-07, "loss": 0.1971, "num_tokens": 11409840.0, "reward": 0.747802734375, "reward_std": 0.007948263548314571, "rewards//mean": 0.747802734375, "rewards//std": 0.03971964120864868, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2642, "grad_norm": 1.781855821609497, "kl": 1.3266140967607498, "learning_rate": 8.461860885303113e-07, "loss": 0.1327, "num_tokens": 11418512.0, "reward": 0.7415771484375, "reward_std": 0.005644769407808781, "rewards//mean": 0.7415771484375, "rewards//std": 0.02440165914595127, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2644, "grad_norm": 2.119027853012085, "kl": 1.2260348349809647, "learning_rate": 8.459570504906961e-07, "loss": 0.1226, "num_tokens": 11427168.0, "reward": 0.75335693359375, "reward_std": 0.011243656277656555, "rewards//mean": 0.75335693359375, "rewards//std": 0.023590637370944023, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2646, "grad_norm": 5.476919651031494, "kl": 1.813824001699686, "learning_rate": 8.457278730996222e-07, "loss": 0.1814, "num_tokens": 11435776.0, "reward": 0.79547119140625, "reward_std": 0.01277064997702837, "rewards//mean": 0.79547119140625, "rewards//std": 0.028351498767733574, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2648, "grad_norm": 4.00071907043457, "kl": 2.1896361093968153, "learning_rate": 8.454985564494024e-07, "loss": 0.219, "num_tokens": 11444424.0, "reward": 0.7470703125, "reward_std": 0.016677251085639, "rewards//mean": 0.7470703125, "rewards//std": 0.03972649946808815, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.265, "grad_norm": 8.978479385375977, "kl": 3.264270981773734, "learning_rate": 8.452691006324054e-07, "loss": 0.3264, "num_tokens": 11453048.0, "reward": 0.7388916015625, "reward_std": 0.01705537736415863, "rewards//mean": 0.7388916015625, "rewards//std": 0.04073323681950569, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2652, "grad_norm": 6.3131914138793945, "kl": 2.1231815684586763, "learning_rate": 8.45039505741056e-07, "loss": 0.2123, "num_tokens": 11461712.0, "reward": 0.74981689453125, "reward_std": 0.005183476489037275, "rewards//mean": 0.74981689453125, "rewards//std": 0.036765486001968384, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2654, "grad_norm": 4.836770534515381, "kl": 1.5332971066236496, "learning_rate": 8.448097718678348e-07, "loss": 0.1533, "num_tokens": 11470416.0, "reward": 0.745849609375, "reward_std": 0.011320450343191624, "rewards//mean": 0.745849609375, "rewards//std": 0.03728518262505531, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2656, "grad_norm": 6.643113136291504, "kl": 2.7718011625111103, "learning_rate": 8.44579899105279e-07, "loss": 0.2772, "num_tokens": 11479032.0, "reward": 0.74407958984375, "reward_std": 0.01326703280210495, "rewards//mean": 0.74407958984375, "rewards//std": 0.029495583847165108, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2658, "grad_norm": 2.29469895362854, "kl": 1.0763975717127323, "learning_rate": 8.443498875459808e-07, "loss": 0.1076, "num_tokens": 11487568.0, "reward": 0.72308349609375, "reward_std": 0.010130094364285469, "rewards//mean": 0.72308349609375, "rewards//std": 0.03735165670514107, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.266, "grad_norm": 2.7798666954040527, "kl": 1.92560656927526, "learning_rate": 8.441197372825892e-07, "loss": 0.1926, "num_tokens": 11496160.0, "reward": 0.7310791015625, "reward_std": 0.013944422826170921, "rewards//mean": 0.7310791015625, "rewards//std": 0.0374300442636013, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2662, "grad_norm": 4.736409664154053, "kl": 1.1363797690719366, "learning_rate": 8.438894484078085e-07, "loss": 0.1136, "num_tokens": 11504792.0, "reward": 0.73931884765625, "reward_std": 0.007724676746875048, "rewards//mean": 0.73931884765625, "rewards//std": 0.029308177530765533, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2664, "grad_norm": 3.171586275100708, "kl": 2.3072773162275553, "learning_rate": 8.43659021014399e-07, "loss": 0.2307, "num_tokens": 11513432.0, "reward": 0.75714111328125, "reward_std": 0.015399273484945297, "rewards//mean": 0.75714111328125, "rewards//std": 0.023984549567103386, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2666, "grad_norm": 2.667379856109619, "kl": 1.5344881527125835, "learning_rate": 8.434284551951772e-07, "loss": 0.1534, "num_tokens": 11522112.0, "reward": 0.77593994140625, "reward_std": 0.01357704121619463, "rewards//mean": 0.77593994140625, "rewards//std": 0.029375243932008743, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2668, "grad_norm": 2.44822359085083, "kl": 1.4912556074559689, "learning_rate": 8.431977510430145e-07, "loss": 0.1491, "num_tokens": 11530648.0, "reward": 0.72918701171875, "reward_std": 0.014410671778023243, "rewards//mean": 0.72918701171875, "rewards//std": 0.041754089295864105, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.267, "grad_norm": 3.6548104286193848, "kl": 1.5616823472082615, "learning_rate": 8.429669086508389e-07, "loss": 0.1562, "num_tokens": 11539376.0, "reward": 0.7783203125, "reward_std": 0.02049800381064415, "rewards//mean": 0.7783203125, "rewards//std": 0.04042753949761391, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2672, "grad_norm": 4.781790733337402, "kl": 1.5930482968688011, "learning_rate": 8.427359281116333e-07, "loss": 0.1593, "num_tokens": 11548024.0, "reward": 0.7432861328125, "reward_std": 0.011560136452317238, "rewards//mean": 0.7432861328125, "rewards//std": 0.0274319089949131, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2674, "grad_norm": 5.124735355377197, "kl": 1.7172706872224808, "learning_rate": 8.42504809518437e-07, "loss": 0.1717, "num_tokens": 11556656.0, "reward": 0.75689697265625, "reward_std": 0.012276984751224518, "rewards//mean": 0.75689697265625, "rewards//std": 0.02750485949218273, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2676, "grad_norm": 7.540124416351318, "kl": 1.4192634783685207, "learning_rate": 8.422735529643443e-07, "loss": 0.1419, "num_tokens": 11565296.0, "reward": 0.76983642578125, "reward_std": 0.008768022060394287, "rewards//mean": 0.76983642578125, "rewards//std": 0.025960668921470642, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2678, "grad_norm": 4.039872169494629, "kl": 1.52224126085639, "learning_rate": 8.420421585425055e-07, "loss": 0.1522, "num_tokens": 11573864.0, "reward": 0.739013671875, "reward_std": 0.011027004569768906, "rewards//mean": 0.739013671875, "rewards//std": 0.033275991678237915, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.268, "grad_norm": 7.840274810791016, "kl": 1.426694292575121, "learning_rate": 8.41810626346126e-07, "loss": 0.1427, "num_tokens": 11582424.0, "reward": 0.7435302734375, "reward_std": 0.006033569574356079, "rewards//mean": 0.7435302734375, "rewards//std": 0.03037925995886326, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2682, "grad_norm": 41.64866256713867, "kl": 0.9570504669100046, "learning_rate": 8.415789564684673e-07, "loss": 0.0957, "num_tokens": 11590968.0, "reward": 0.793212890625, "reward_std": 0.008649616502225399, "rewards//mean": 0.793212890625, "rewards//std": 0.018556727096438408, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2684, "grad_norm": 5.264158725738525, "kl": 1.6046327948570251, "learning_rate": 8.413471490028455e-07, "loss": 0.1605, "num_tokens": 11599640.0, "reward": 0.77490234375, "reward_std": 0.011650312691926956, "rewards//mean": 0.77490234375, "rewards//std": 0.024404451251029968, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2686, "grad_norm": 17.011680603027344, "kl": 1.7282491251826286, "learning_rate": 8.41115204042633e-07, "loss": 0.1728, "num_tokens": 11608232.0, "reward": 0.7513427734375, "reward_std": 0.009822498075664043, "rewards//mean": 0.7513427734375, "rewards//std": 0.03263173997402191, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2688, "grad_norm": 5.446418762207031, "kl": 1.3230797797441483, "learning_rate": 8.408831216812573e-07, "loss": 0.1323, "num_tokens": 11616920.0, "reward": 0.7596435546875, "reward_std": 0.008739085868000984, "rewards//mean": 0.7596435546875, "rewards//std": 0.04049918055534363, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.269, "grad_norm": 41.21394348144531, "kl": 2.531970787793398, "learning_rate": 8.406509020122008e-07, "loss": 0.2532, "num_tokens": 11625560.0, "reward": 0.78192138671875, "reward_std": 0.00816678162664175, "rewards//mean": 0.78192138671875, "rewards//std": 0.02970474772155285, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2692, "grad_norm": 31.490951538085938, "kl": 2.656165039166808, "learning_rate": 8.404185451290017e-07, "loss": 0.2656, "num_tokens": 11634256.0, "reward": 0.759765625, "reward_std": 0.014978468418121338, "rewards//mean": 0.759765625, "rewards//std": 0.042722851037979126, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2694, "grad_norm": 43.99723815917969, "kl": 2.4182144086807966, "learning_rate": 8.401860511252533e-07, "loss": 0.2418, "num_tokens": 11642840.0, "reward": 0.7490234375, "reward_std": 0.01979096420109272, "rewards//mean": 0.7490234375, "rewards//std": 0.03907490149140358, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2696, "grad_norm": 21.77730941772461, "kl": 1.8901789207011461, "learning_rate": 8.399534200946043e-07, "loss": 0.189, "num_tokens": 11651408.0, "reward": 0.74652099609375, "reward_std": 0.014805897139012814, "rewards//mean": 0.74652099609375, "rewards//std": 0.03428862988948822, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2698, "grad_norm": 11.703360557556152, "kl": 2.190919779241085, "learning_rate": 8.397206521307583e-07, "loss": 0.2191, "num_tokens": 11660048.0, "reward": 0.75946044921875, "reward_std": 0.005028828978538513, "rewards//mean": 0.75946044921875, "rewards//std": 0.02802877128124237, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.27, "grad_norm": 1.4796116352081299, "kl": 1.4740431364625692, "learning_rate": 8.394877473274741e-07, "loss": 0.1474, "num_tokens": 11668728.0, "reward": 0.74029541015625, "reward_std": 0.006811236031353474, "rewards//mean": 0.74029541015625, "rewards//std": 0.032652318477630615, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2702, "grad_norm": 3.8744845390319824, "kl": 1.435689877718687, "learning_rate": 8.392547057785661e-07, "loss": 0.1436, "num_tokens": 11677376.0, "reward": 0.7760009765625, "reward_std": 0.015611299313604832, "rewards//mean": 0.7760009765625, "rewards//std": 0.030098924413323402, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2704, "grad_norm": 6.678821563720703, "kl": 1.6774704921990633, "learning_rate": 8.39021527577903e-07, "loss": 0.1677, "num_tokens": 11686016.0, "reward": 0.7735595703125, "reward_std": 0.017164621502161026, "rewards//mean": 0.7735595703125, "rewards//std": 0.03084409609436989, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2706, "grad_norm": 2.4394986629486084, "kl": 1.917915841564536, "learning_rate": 8.387882128194092e-07, "loss": 0.1918, "num_tokens": 11694712.0, "reward": 0.7869873046875, "reward_std": 0.016864636912941933, "rewards//mean": 0.7869873046875, "rewards//std": 0.03363863006234169, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2708, "grad_norm": 2.828508138656616, "kl": 1.0346383973956108, "learning_rate": 8.385547615970638e-07, "loss": 0.1035, "num_tokens": 11703384.0, "reward": 0.7320556640625, "reward_std": 0.008335303515195847, "rewards//mean": 0.7320556640625, "rewards//std": 0.02160627581179142, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.271, "grad_norm": 3.749871015548706, "kl": 1.5432432442903519, "learning_rate": 8.38321174004901e-07, "loss": 0.1543, "num_tokens": 11712016.0, "reward": 0.7733154296875, "reward_std": 0.012710566632449627, "rewards//mean": 0.7733154296875, "rewards//std": 0.02752446010708809, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2712, "grad_norm": 2.655707597732544, "kl": 1.4231681134551764, "learning_rate": 8.380874501370097e-07, "loss": 0.1423, "num_tokens": 11720648.0, "reward": 0.7662353515625, "reward_std": 0.010058829560875893, "rewards//mean": 0.7662353515625, "rewards//std": 0.030403168871998787, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2714, "grad_norm": 8.475590705871582, "kl": 1.1434898935258389, "learning_rate": 8.378535900875338e-07, "loss": 0.1143, "num_tokens": 11729304.0, "reward": 0.73297119140625, "reward_std": 0.013280518352985382, "rewards//mean": 0.73297119140625, "rewards//std": 0.032018449157476425, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2716, "grad_norm": 5.44661283493042, "kl": 1.1054871659725904, "learning_rate": 8.376195939506725e-07, "loss": 0.1105, "num_tokens": 11738040.0, "reward": 0.734619140625, "reward_std": 0.008744160644710064, "rewards//mean": 0.734619140625, "rewards//std": 0.031225770711898804, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2718, "grad_norm": 11.936436653137207, "kl": 2.7080367766320705, "learning_rate": 8.373854618206789e-07, "loss": 0.2708, "num_tokens": 11746752.0, "reward": 0.72198486328125, "reward_std": 0.01292148232460022, "rewards//mean": 0.72198486328125, "rewards//std": 0.04071144387125969, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.272, "grad_norm": 2.589662551879883, "kl": 1.080529686063528, "learning_rate": 8.371511937918617e-07, "loss": 0.1081, "num_tokens": 11755448.0, "reward": 0.72021484375, "reward_std": 0.007460972294211388, "rewards//mean": 0.72021484375, "rewards//std": 0.03240293264389038, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2722, "grad_norm": 2.6604063510894775, "kl": 1.9544231854379177, "learning_rate": 8.369167899585839e-07, "loss": 0.1954, "num_tokens": 11764152.0, "reward": 0.759033203125, "reward_std": 0.012583325617015362, "rewards//mean": 0.759033203125, "rewards//std": 0.033967550843954086, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2724, "grad_norm": 7.418876647949219, "kl": 0.8234708085656166, "learning_rate": 8.366822504152636e-07, "loss": 0.0823, "num_tokens": 11772776.0, "reward": 0.73114013671875, "reward_std": 0.0036419208627194166, "rewards//mean": 0.73114013671875, "rewards//std": 0.028743363916873932, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2726, "grad_norm": 3.678071975708008, "kl": 1.141478419303894, "learning_rate": 8.364475752563728e-07, "loss": 0.1141, "num_tokens": 11781408.0, "reward": 0.75323486328125, "reward_std": 0.008702388033270836, "rewards//mean": 0.75323486328125, "rewards//std": 0.034319959580898285, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2728, "grad_norm": 2.519693374633789, "kl": 0.5582467820495367, "learning_rate": 8.362127645764389e-07, "loss": 0.0558, "num_tokens": 11789976.0, "reward": 0.75274658203125, "reward_std": 0.0028330846689641476, "rewards//mean": 0.75274658203125, "rewards//std": 0.025052646175026894, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.273, "grad_norm": 4.423770427703857, "kl": 2.3881990388035774, "learning_rate": 8.359778184700439e-07, "loss": 0.2388, "num_tokens": 11798632.0, "reward": 0.79425048828125, "reward_std": 0.02025095745921135, "rewards//mean": 0.79425048828125, "rewards//std": 0.032324761152267456, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2732, "grad_norm": 3.8239078521728516, "kl": 1.0817086547613144, "learning_rate": 8.357427370318238e-07, "loss": 0.1082, "num_tokens": 11807256.0, "reward": 0.76885986328125, "reward_std": 0.007265827618539333, "rewards//mean": 0.76885986328125, "rewards//std": 0.025167187675833702, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2734, "grad_norm": 3.446352481842041, "kl": 0.7759816534817219, "learning_rate": 8.355075203564692e-07, "loss": 0.0776, "num_tokens": 11815920.0, "reward": 0.75262451171875, "reward_std": 0.00164124951697886, "rewards//mean": 0.75262451171875, "rewards//std": 0.028603991493582726, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2736, "grad_norm": 10.429774284362793, "kl": 1.9041251055896282, "learning_rate": 8.352721685387256e-07, "loss": 0.1904, "num_tokens": 11824560.0, "reward": 0.7603759765625, "reward_std": 0.011990025639533997, "rewards//mean": 0.7603759765625, "rewards//std": 0.033329401165246964, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2738, "grad_norm": 5.327267169952393, "kl": 1.3947375752031803, "learning_rate": 8.350366816733926e-07, "loss": 0.1395, "num_tokens": 11833184.0, "reward": 0.7496337890625, "reward_std": 0.011457724496722221, "rewards//mean": 0.7496337890625, "rewards//std": 0.03452516347169876, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.274, "grad_norm": 2.0751752853393555, "kl": 1.0512386709451675, "learning_rate": 8.348010598553243e-07, "loss": 0.1051, "num_tokens": 11841856.0, "reward": 0.75750732421875, "reward_std": 0.005020422860980034, "rewards//mean": 0.75750732421875, "rewards//std": 0.022939324378967285, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2742, "grad_norm": 7.620663642883301, "kl": 1.806717999279499, "learning_rate": 8.34565303179429e-07, "loss": 0.1807, "num_tokens": 11850632.0, "reward": 0.7593994140625, "reward_std": 0.008870774880051613, "rewards//mean": 0.7593994140625, "rewards//std": 0.0319984070956707, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2744, "grad_norm": 1.1963679790496826, "kl": 0.5940828789025545, "learning_rate": 8.343294117406698e-07, "loss": 0.0594, "num_tokens": 11859328.0, "reward": 0.76617431640625, "reward_std": 0.003053711960092187, "rewards//mean": 0.76617431640625, "rewards//std": 0.023155411705374718, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2746, "grad_norm": 7.2105231285095215, "kl": 1.6511837430298328, "learning_rate": 8.340933856340635e-07, "loss": 0.1651, "num_tokens": 11867920.0, "reward": 0.7418212890625, "reward_std": 0.0061455387622118, "rewards//mean": 0.7418212890625, "rewards//std": 0.033274855464696884, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2748, "grad_norm": 4.255744457244873, "kl": 1.6649334002286196, "learning_rate": 8.338572249546812e-07, "loss": 0.1665, "num_tokens": 11876616.0, "reward": 0.75311279296875, "reward_std": 0.006844652350991964, "rewards//mean": 0.75311279296875, "rewards//std": 0.035673823207616806, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.275, "grad_norm": 4.840819358825684, "kl": 1.4184761084616184, "learning_rate": 8.336209297976489e-07, "loss": 0.1418, "num_tokens": 11885248.0, "reward": 0.77081298828125, "reward_std": 0.009416550397872925, "rewards//mean": 0.77081298828125, "rewards//std": 0.020740758627653122, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2752, "grad_norm": 9.01152515411377, "kl": 2.583045953884721, "learning_rate": 8.333845002581458e-07, "loss": 0.2583, "num_tokens": 11893872.0, "reward": 0.75299072265625, "reward_std": 0.0110011612996459, "rewards//mean": 0.75299072265625, "rewards//std": 0.029860787093639374, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2754, "grad_norm": 3.243278980255127, "kl": 1.8015248626470566, "learning_rate": 8.331479364314059e-07, "loss": 0.1802, "num_tokens": 11902448.0, "reward": 0.75286865234375, "reward_std": 0.009883337654173374, "rewards//mean": 0.75286865234375, "rewards//std": 0.029341213405132294, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2756, "grad_norm": 5.122808933258057, "kl": 1.906004762277007, "learning_rate": 8.32911238412717e-07, "loss": 0.1906, "num_tokens": 11911064.0, "reward": 0.78759765625, "reward_std": 0.01040503941476345, "rewards//mean": 0.78759765625, "rewards//std": 0.029433997347950935, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2758, "grad_norm": 8.389034271240234, "kl": 0.8222418315708637, "learning_rate": 8.326744062974211e-07, "loss": 0.0822, "num_tokens": 11919760.0, "reward": 0.74853515625, "reward_std": 0.004016595426946878, "rewards//mean": 0.74853515625, "rewards//std": 0.02361765317618847, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.276, "grad_norm": 4.624485492706299, "kl": 1.3433728516101837, "learning_rate": 8.324374401809142e-07, "loss": 0.1343, "num_tokens": 11928344.0, "reward": 0.72930908203125, "reward_std": 0.012796587310731411, "rewards//mean": 0.72930908203125, "rewards//std": 0.04307110235095024, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2762, "grad_norm": 14.707901954650879, "kl": 0.9655712600797415, "learning_rate": 8.322003401586461e-07, "loss": 0.0966, "num_tokens": 11936976.0, "reward": 0.77117919921875, "reward_std": 0.0059419069439172745, "rewards//mean": 0.77117919921875, "rewards//std": 0.02969047613441944, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2764, "grad_norm": 2.6805195808410645, "kl": 1.6826000418514013, "learning_rate": 8.319631063261207e-07, "loss": 0.1683, "num_tokens": 11945576.0, "reward": 0.755615234375, "reward_std": 0.008507579565048218, "rewards//mean": 0.755615234375, "rewards//std": 0.025703487917780876, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2766, "grad_norm": 0.2917509973049164, "kl": 0.4393001478165388, "learning_rate": 8.317257387788958e-07, "loss": 0.0439, "num_tokens": 11954136.0, "reward": 0.74420166015625, "reward_std": 0.0009134745923802257, "rewards//mean": 0.74420166015625, "rewards//std": 0.022498900070786476, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2768, "grad_norm": 6.497793674468994, "kl": 1.4654463231563568, "learning_rate": 8.314882376125831e-07, "loss": 0.1465, "num_tokens": 11962824.0, "reward": 0.783447265625, "reward_std": 0.01338261365890503, "rewards//mean": 0.783447265625, "rewards//std": 0.03168009966611862, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.277, "grad_norm": 1.3848124742507935, "kl": 0.572774613276124, "learning_rate": 8.312506029228477e-07, "loss": 0.0573, "num_tokens": 11971504.0, "reward": 0.75177001953125, "reward_std": 0.003506066743284464, "rewards//mean": 0.75177001953125, "rewards//std": 0.029289059340953827, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2772, "grad_norm": 9.892672538757324, "kl": 2.5279021225869656, "learning_rate": 8.310128348054093e-07, "loss": 0.2528, "num_tokens": 11980152.0, "reward": 0.7596435546875, "reward_std": 0.00803013239055872, "rewards//mean": 0.7596435546875, "rewards//std": 0.0346529558300972, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2774, "grad_norm": 1.4608051776885986, "kl": 0.91611497849226, "learning_rate": 8.307749333560404e-07, "loss": 0.0916, "num_tokens": 11988744.0, "reward": 0.767333984375, "reward_std": 0.00652629230171442, "rewards//mean": 0.767333984375, "rewards//std": 0.021405315026640892, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2776, "grad_norm": 19.319339752197266, "kl": 0.8584948647767305, "learning_rate": 8.305368986705681e-07, "loss": 0.0858, "num_tokens": 11997312.0, "reward": 0.73516845703125, "reward_std": 0.00625626090914011, "rewards//mean": 0.73516845703125, "rewards//std": 0.034359633922576904, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2778, "grad_norm": 3.730419397354126, "kl": 1.312489127740264, "learning_rate": 8.302987308448723e-07, "loss": 0.1312, "num_tokens": 12005928.0, "reward": 0.759033203125, "reward_std": 0.010758569464087486, "rewards//mean": 0.759033203125, "rewards//std": 0.03963417932391167, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.278, "grad_norm": 2.4365758895874023, "kl": 0.5073908474296331, "learning_rate": 8.300604299748874e-07, "loss": 0.0507, "num_tokens": 12014688.0, "reward": 0.74853515625, "reward_std": 0.0016159163787961006, "rewards//mean": 0.74853515625, "rewards//std": 0.02691522240638733, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2782, "grad_norm": 13.980290412902832, "kl": 2.666330335661769, "learning_rate": 8.298219961566008e-07, "loss": 0.2666, "num_tokens": 12023352.0, "reward": 0.72003173828125, "reward_std": 0.015395499765872955, "rewards//mean": 0.72003173828125, "rewards//std": 0.03750009462237358, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2784, "grad_norm": 9.084924697875977, "kl": 1.347284598276019, "learning_rate": 8.295834294860534e-07, "loss": 0.1347, "num_tokens": 12031984.0, "reward": 0.75323486328125, "reward_std": 0.0059725199826061726, "rewards//mean": 0.75323486328125, "rewards//std": 0.028861092403531075, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2786, "grad_norm": 4.401847839355469, "kl": 1.36901949159801, "learning_rate": 8.293447300593402e-07, "loss": 0.1369, "num_tokens": 12040632.0, "reward": 0.754150390625, "reward_std": 0.01407884992659092, "rewards//mean": 0.754150390625, "rewards//std": 0.03121025487780571, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2788, "grad_norm": 6.017202377319336, "kl": 1.452460439875722, "learning_rate": 8.291058979726091e-07, "loss": 0.1452, "num_tokens": 12049192.0, "reward": 0.7608642578125, "reward_std": 0.006034743040800095, "rewards//mean": 0.7608642578125, "rewards//std": 0.030595744028687477, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.279, "grad_norm": 1.6311331987380981, "kl": 1.444918017834425, "learning_rate": 8.288669333220614e-07, "loss": 0.1445, "num_tokens": 12057776.0, "reward": 0.7515869140625, "reward_std": 0.010062210261821747, "rewards//mean": 0.7515869140625, "rewards//std": 0.021784896031022072, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2792, "grad_norm": 5.6906328201293945, "kl": 2.264435239136219, "learning_rate": 8.286278362039527e-07, "loss": 0.2264, "num_tokens": 12066448.0, "reward": 0.76422119140625, "reward_std": 0.011034558527171612, "rewards//mean": 0.76422119140625, "rewards//std": 0.028188716620206833, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2794, "grad_norm": 1.753432273864746, "kl": 1.2903249748051167, "learning_rate": 8.283886067145906e-07, "loss": 0.129, "num_tokens": 12075056.0, "reward": 0.7764892578125, "reward_std": 0.012123174034059048, "rewards//mean": 0.7764892578125, "rewards//std": 0.0373992957174778, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2796, "grad_norm": 4.2095818519592285, "kl": 1.2844493500888348, "learning_rate": 8.281492449503372e-07, "loss": 0.1284, "num_tokens": 12083688.0, "reward": 0.763427734375, "reward_std": 0.007913639768958092, "rewards//mean": 0.763427734375, "rewards//std": 0.025285478681325912, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2798, "grad_norm": 3.5186710357666016, "kl": 1.1074212044477463, "learning_rate": 8.279097510076069e-07, "loss": 0.1107, "num_tokens": 12092360.0, "reward": 0.7769775390625, "reward_std": 0.009716667234897614, "rewards//mean": 0.7769775390625, "rewards//std": 0.033347565680742264, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.28, "grad_norm": 1.393874168395996, "kl": 1.345550624653697, "learning_rate": 8.276701249828684e-07, "loss": 0.1346, "num_tokens": 12100960.0, "reward": 0.75579833984375, "reward_std": 0.009464503265917301, "rewards//mean": 0.75579833984375, "rewards//std": 0.03382548317313194, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2802, "grad_norm": 3.9176719188690186, "kl": 0.874677112326026, "learning_rate": 8.274303669726426e-07, "loss": 0.0875, "num_tokens": 12109584.0, "reward": 0.764404296875, "reward_std": 0.006336133927106857, "rewards//mean": 0.764404296875, "rewards//std": 0.03806307911872864, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2804, "grad_norm": 11.06923770904541, "kl": 1.2614341340959072, "learning_rate": 8.271904770735041e-07, "loss": 0.1261, "num_tokens": 12118208.0, "reward": 0.77410888671875, "reward_std": 0.010842102579772472, "rewards//mean": 0.77410888671875, "rewards//std": 0.029729198664426804, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2806, "grad_norm": 4.535281658172607, "kl": 1.4729410503059626, "learning_rate": 8.269504553820805e-07, "loss": 0.1473, "num_tokens": 12126832.0, "reward": 0.78204345703125, "reward_std": 0.008088983595371246, "rewards//mean": 0.78204345703125, "rewards//std": 0.026659158989787102, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2808, "grad_norm": 3.3165090084075928, "kl": 1.9795909393578768, "learning_rate": 8.267103019950528e-07, "loss": 0.198, "num_tokens": 12135432.0, "reward": 0.71868896484375, "reward_std": 0.010753355920314789, "rewards//mean": 0.71868896484375, "rewards//std": 0.03845316171646118, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.281, "grad_norm": 6.39655876159668, "kl": 1.7917762715369463, "learning_rate": 8.264700170091543e-07, "loss": 0.1792, "num_tokens": 12143992.0, "reward": 0.75311279296875, "reward_std": 0.011714443564414978, "rewards//mean": 0.75311279296875, "rewards//std": 0.02290630340576172, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2812, "grad_norm": 15.693798065185547, "kl": 2.989787459373474, "learning_rate": 8.262296005211721e-07, "loss": 0.299, "num_tokens": 12152592.0, "reward": 0.74774169921875, "reward_std": 0.009115074761211872, "rewards//mean": 0.74774169921875, "rewards//std": 0.03236033394932747, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2814, "grad_norm": 2.443939208984375, "kl": 1.4647480100393295, "learning_rate": 8.259890526279459e-07, "loss": 0.1465, "num_tokens": 12161296.0, "reward": 0.7900390625, "reward_std": 0.010983582586050034, "rewards//mean": 0.7900390625, "rewards//std": 0.02108754962682724, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2816, "grad_norm": 6.9362592697143555, "kl": 2.383898377418518, "learning_rate": 8.257483734263681e-07, "loss": 0.2384, "num_tokens": 12169976.0, "reward": 0.7171630859375, "reward_std": 0.01384001411497593, "rewards//mean": 0.7171630859375, "rewards//std": 0.04279489442706108, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2818, "grad_norm": 6.503877639770508, "kl": 0.9984986390918493, "learning_rate": 8.255075630133845e-07, "loss": 0.0998, "num_tokens": 12178552.0, "reward": 0.77008056640625, "reward_std": 0.00814627856016159, "rewards//mean": 0.77008056640625, "rewards//std": 0.019782207906246185, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.282, "grad_norm": 4.591920852661133, "kl": 1.442638648673892, "learning_rate": 8.252666214859934e-07, "loss": 0.1443, "num_tokens": 12187224.0, "reward": 0.73504638671875, "reward_std": 0.007913686335086823, "rewards//mean": 0.73504638671875, "rewards//std": 0.03350578248500824, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2822, "grad_norm": 2.359144687652588, "kl": 1.9792951606214046, "learning_rate": 8.250255489412462e-07, "loss": 0.1979, "num_tokens": 12195976.0, "reward": 0.75262451171875, "reward_std": 0.013961941003799438, "rewards//mean": 0.75262451171875, "rewards//std": 0.03549600765109062, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2824, "grad_norm": 5.347057342529297, "kl": 1.660464035347104, "learning_rate": 8.247843454762466e-07, "loss": 0.166, "num_tokens": 12204544.0, "reward": 0.76068115234375, "reward_std": 0.019290976226329803, "rewards//mean": 0.76068115234375, "rewards//std": 0.03252783417701721, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2826, "grad_norm": 4.361217021942139, "kl": 2.0865957494825125, "learning_rate": 8.245430111881517e-07, "loss": 0.2087, "num_tokens": 12213224.0, "reward": 0.775634765625, "reward_std": 0.02434913069009781, "rewards//mean": 0.775634765625, "rewards//std": 0.0416659414768219, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2828, "grad_norm": 6.953249454498291, "kl": 1.9759225770831108, "learning_rate": 8.243015461741706e-07, "loss": 0.1976, "num_tokens": 12221920.0, "reward": 0.7635498046875, "reward_std": 0.011011242866516113, "rewards//mean": 0.7635498046875, "rewards//std": 0.03638497740030289, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.283, "grad_norm": 3.491920232772827, "kl": 0.8142143860459328, "learning_rate": 8.240599505315654e-07, "loss": 0.0814, "num_tokens": 12230600.0, "reward": 0.764404296875, "reward_std": 0.005777326878160238, "rewards//mean": 0.764404296875, "rewards//std": 0.0344771184027195, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2832, "grad_norm": 4.880618095397949, "kl": 1.9639583434909582, "learning_rate": 8.238182243576511e-07, "loss": 0.1964, "num_tokens": 12239248.0, "reward": 0.7703857421875, "reward_std": 0.011037491261959076, "rewards//mean": 0.7703857421875, "rewards//std": 0.031300097703933716, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2834, "grad_norm": 3.7134809494018555, "kl": 1.022446770220995, "learning_rate": 8.235763677497945e-07, "loss": 0.1022, "num_tokens": 12247824.0, "reward": 0.7918701171875, "reward_std": 0.008282292634248734, "rewards//mean": 0.7918701171875, "rewards//std": 0.024094512686133385, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2836, "grad_norm": 5.692349910736084, "kl": 1.8521916195750237, "learning_rate": 8.233343808054157e-07, "loss": 0.1852, "num_tokens": 12256496.0, "reward": 0.75799560546875, "reward_std": 0.018615327775478363, "rewards//mean": 0.75799560546875, "rewards//std": 0.039850860834121704, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2838, "grad_norm": 9.783778190612793, "kl": 1.7258297987282276, "learning_rate": 8.23092263621987e-07, "loss": 0.1726, "num_tokens": 12265152.0, "reward": 0.7305908203125, "reward_std": 0.008400633931159973, "rewards//mean": 0.7305908203125, "rewards//std": 0.03428758308291435, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.284, "grad_norm": 3.2923576831817627, "kl": 1.6641437392681837, "learning_rate": 8.228500162970332e-07, "loss": 0.1664, "num_tokens": 12273816.0, "reward": 0.740478515625, "reward_std": 0.019815467298030853, "rewards//mean": 0.740478515625, "rewards//std": 0.050572752952575684, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2842, "grad_norm": 9.795512199401855, "kl": 1.4889667555689812, "learning_rate": 8.226076389281314e-07, "loss": 0.1489, "num_tokens": 12282528.0, "reward": 0.76580810546875, "reward_std": 0.014538135379552841, "rewards//mean": 0.76580810546875, "rewards//std": 0.03402137756347656, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2844, "grad_norm": 3.6462700366973877, "kl": 1.158252900466323, "learning_rate": 8.223651316129114e-07, "loss": 0.1158, "num_tokens": 12291168.0, "reward": 0.72418212890625, "reward_std": 0.007890328764915466, "rewards//mean": 0.72418212890625, "rewards//std": 0.02440561354160309, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2846, "grad_norm": 7.619853973388672, "kl": 0.9519822169095278, "learning_rate": 8.221224944490548e-07, "loss": 0.0952, "num_tokens": 12299824.0, "reward": 0.7584228515625, "reward_std": 0.010799422860145569, "rewards//mean": 0.7584228515625, "rewards//std": 0.034787241369485855, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2848, "grad_norm": 2.0467350482940674, "kl": 1.29447266086936, "learning_rate": 8.21879727534296e-07, "loss": 0.1294, "num_tokens": 12308464.0, "reward": 0.78607177734375, "reward_std": 0.00892403069883585, "rewards//mean": 0.78607177734375, "rewards//std": 0.03117816522717476, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.285, "grad_norm": 2.3626134395599365, "kl": 0.8906732350587845, "learning_rate": 8.216368309664213e-07, "loss": 0.0891, "num_tokens": 12316984.0, "reward": 0.77587890625, "reward_std": 0.005383210722357035, "rewards//mean": 0.77587890625, "rewards//std": 0.02964716963469982, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2852, "grad_norm": 0.7195881009101868, "kl": 0.7214030046015978, "learning_rate": 8.213938048432696e-07, "loss": 0.0721, "num_tokens": 12325648.0, "reward": 0.77227783203125, "reward_std": 0.005835712421685457, "rewards//mean": 0.77227783203125, "rewards//std": 0.02550298348069191, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2854, "grad_norm": 3.8188791275024414, "kl": 1.3568202015012503, "learning_rate": 8.211506492627318e-07, "loss": 0.1357, "num_tokens": 12334280.0, "reward": 0.73175048828125, "reward_std": 0.009329218417406082, "rewards//mean": 0.73175048828125, "rewards//std": 0.032556209713220596, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2856, "grad_norm": 4.030502796173096, "kl": 0.928346149623394, "learning_rate": 8.209073643227509e-07, "loss": 0.0928, "num_tokens": 12342960.0, "reward": 0.744140625, "reward_std": 0.009696818888187408, "rewards//mean": 0.744140625, "rewards//std": 0.04009667783975601, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2858, "grad_norm": 2.174384593963623, "kl": 0.8410797268152237, "learning_rate": 8.206639501213219e-07, "loss": 0.0841, "num_tokens": 12351680.0, "reward": 0.7952880859375, "reward_std": 0.005080194212496281, "rewards//mean": 0.7952880859375, "rewards//std": 0.023101497441530228, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.286, "grad_norm": 7.616950035095215, "kl": 1.8224603720009327, "learning_rate": 8.204204067564924e-07, "loss": 0.1822, "num_tokens": 12360304.0, "reward": 0.74810791015625, "reward_std": 0.010432298295199871, "rewards//mean": 0.74810791015625, "rewards//std": 0.023281892761588097, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2862, "grad_norm": 4.008780002593994, "kl": 1.6188208274543285, "learning_rate": 8.201767343263611e-07, "loss": 0.1619, "num_tokens": 12368984.0, "reward": 0.7171630859375, "reward_std": 0.009844319894909859, "rewards//mean": 0.7171630859375, "rewards//std": 0.03988299518823624, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2864, "grad_norm": 2.8394522666931152, "kl": 1.9309929125010967, "learning_rate": 8.199329329290796e-07, "loss": 0.1931, "num_tokens": 12377728.0, "reward": 0.76324462890625, "reward_std": 0.0168285071849823, "rewards//mean": 0.76324462890625, "rewards//std": 0.03365769237279892, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2866, "grad_norm": 3.6594748497009277, "kl": 0.829441886395216, "learning_rate": 8.19689002662851e-07, "loss": 0.0829, "num_tokens": 12386304.0, "reward": 0.72027587890625, "reward_std": 0.006811304483562708, "rewards//mean": 0.72027587890625, "rewards//std": 0.034573525190353394, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2868, "grad_norm": 8.208528518676758, "kl": 1.313731512054801, "learning_rate": 8.194449436259303e-07, "loss": 0.1314, "num_tokens": 12394880.0, "reward": 0.78411865234375, "reward_std": 0.006044465582817793, "rewards//mean": 0.78411865234375, "rewards//std": 0.024678243324160576, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.287, "grad_norm": 1.9686647653579712, "kl": 0.4446337874978781, "learning_rate": 8.192007559166247e-07, "loss": 0.0445, "num_tokens": 12403512.0, "reward": 0.75067138671875, "reward_std": 0.0007085598772391677, "rewards//mean": 0.75067138671875, "rewards//std": 0.02182750217616558, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2872, "grad_norm": 1.48588228225708, "kl": 1.269335813820362, "learning_rate": 8.189564396332926e-07, "loss": 0.1269, "num_tokens": 12412144.0, "reward": 0.77288818359375, "reward_std": 0.0065114363096654415, "rewards//mean": 0.77288818359375, "rewards//std": 0.02634557895362377, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2874, "grad_norm": 3.78623628616333, "kl": 1.1646060831844807, "learning_rate": 8.187119948743449e-07, "loss": 0.1165, "num_tokens": 12420840.0, "reward": 0.74029541015625, "reward_std": 0.00750130508095026, "rewards//mean": 0.74029541015625, "rewards//std": 0.02111041732132435, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2876, "grad_norm": 4.180212497711182, "kl": 0.7835429180413485, "learning_rate": 8.184674217382437e-07, "loss": 0.0784, "num_tokens": 12429520.0, "reward": 0.71539306640625, "reward_std": 0.007275192998349667, "rewards//mean": 0.71539306640625, "rewards//std": 0.036230869591236115, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2878, "grad_norm": 3.0557329654693604, "kl": 1.472244618460536, "learning_rate": 8.182227203235031e-07, "loss": 0.1472, "num_tokens": 12438240.0, "reward": 0.754638671875, "reward_std": 0.01097121275961399, "rewards//mean": 0.754638671875, "rewards//std": 0.032851118594408035, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.288, "grad_norm": 2.872533082962036, "kl": 1.2290428895503283, "learning_rate": 8.179778907286887e-07, "loss": 0.1229, "num_tokens": 12446904.0, "reward": 0.73321533203125, "reward_std": 0.008530453778803349, "rewards//mean": 0.73321533203125, "rewards//std": 0.02781081199645996, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2882, "grad_norm": 14.644976615905762, "kl": 2.908546209335327, "learning_rate": 8.177329330524181e-07, "loss": 0.2909, "num_tokens": 12455888.0, "reward": 0.6920166015625, "reward_std": 0.01498452853411436, "rewards//mean": 0.6920166015625, "rewards//std": 0.05833989009261131, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2884, "grad_norm": 3.194599151611328, "kl": 1.943106023594737, "learning_rate": 8.1748784739336e-07, "loss": 0.1943, "num_tokens": 12464520.0, "reward": 0.7371826171875, "reward_std": 0.006786532700061798, "rewards//mean": 0.7371826171875, "rewards//std": 0.02880997397005558, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2886, "grad_norm": 3.254049062728882, "kl": 0.7659182902425528, "learning_rate": 8.17242633850235e-07, "loss": 0.0766, "num_tokens": 12473136.0, "reward": 0.75555419921875, "reward_std": 0.00260446360334754, "rewards//mean": 0.75555419921875, "rewards//std": 0.020061077550053596, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2888, "grad_norm": 4.499659538269043, "kl": 1.9864756613969803, "learning_rate": 8.16997292521815e-07, "loss": 0.1986, "num_tokens": 12481776.0, "reward": 0.775390625, "reward_std": 0.010401003062725067, "rewards//mean": 0.775390625, "rewards//std": 0.03470904380083084, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.289, "grad_norm": 5.809593200683594, "kl": 1.9560941476374865, "learning_rate": 8.167518235069234e-07, "loss": 0.1956, "num_tokens": 12490408.0, "reward": 0.76617431640625, "reward_std": 0.01820463128387928, "rewards//mean": 0.76617431640625, "rewards//std": 0.03307150676846504, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2892, "grad_norm": 3.595529079437256, "kl": 1.4351292960345745, "learning_rate": 8.165062269044352e-07, "loss": 0.1435, "num_tokens": 12498952.0, "reward": 0.73309326171875, "reward_std": 0.010006649419665337, "rewards//mean": 0.73309326171875, "rewards//std": 0.030346043407917023, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2894, "grad_norm": 2.889712333679199, "kl": 1.7049615774303675, "learning_rate": 8.162605028132768e-07, "loss": 0.1705, "num_tokens": 12507640.0, "reward": 0.7738037109375, "reward_std": 0.010479219257831573, "rewards//mean": 0.7738037109375, "rewards//std": 0.025331832468509674, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2896, "grad_norm": 12.884332656860352, "kl": 2.1789142582565546, "learning_rate": 8.160146513324254e-07, "loss": 0.2179, "num_tokens": 12516336.0, "reward": 0.714599609375, "reward_std": 0.0071917022578418255, "rewards//mean": 0.714599609375, "rewards//std": 0.03260691091418266, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2898, "grad_norm": 3.4388458728790283, "kl": 1.781784588471055, "learning_rate": 8.157686725609105e-07, "loss": 0.1782, "num_tokens": 12524968.0, "reward": 0.7850341796875, "reward_std": 0.015044385567307472, "rewards//mean": 0.7850341796875, "rewards//std": 0.03369618207216263, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.29, "grad_norm": 10.754867553710938, "kl": 2.7751773670315742, "learning_rate": 8.155225665978118e-07, "loss": 0.2775, "num_tokens": 12533680.0, "reward": 0.740234375, "reward_std": 0.013198032043874264, "rewards//mean": 0.740234375, "rewards//std": 0.036959774792194366, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2902, "grad_norm": 4.8012261390686035, "kl": 0.6618184391409159, "learning_rate": 8.152763335422612e-07, "loss": 0.0662, "num_tokens": 12542256.0, "reward": 0.75982666015625, "reward_std": 0.006642960011959076, "rewards//mean": 0.75982666015625, "rewards//std": 0.028561091050505638, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2904, "grad_norm": 0.9024642109870911, "kl": 0.6103915609419346, "learning_rate": 8.150299734934412e-07, "loss": 0.061, "num_tokens": 12550904.0, "reward": 0.73675537109375, "reward_std": 0.0029347692616283894, "rewards//mean": 0.73675537109375, "rewards//std": 0.022995345294475555, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2906, "grad_norm": 8.933744430541992, "kl": 2.170983050018549, "learning_rate": 8.147834865505853e-07, "loss": 0.2171, "num_tokens": 12559496.0, "reward": 0.72320556640625, "reward_std": 0.009682497940957546, "rewards//mean": 0.72320556640625, "rewards//std": 0.04538314789533615, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2908, "grad_norm": 2.9089457988739014, "kl": 0.6553179547190666, "learning_rate": 8.145368728129789e-07, "loss": 0.0655, "num_tokens": 12568048.0, "reward": 0.74383544921875, "reward_std": 0.0035527851432561874, "rewards//mean": 0.74383544921875, "rewards//std": 0.025993304327130318, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.291, "grad_norm": 2.4311299324035645, "kl": 0.8503626752644777, "learning_rate": 8.142901323799577e-07, "loss": 0.085, "num_tokens": 12576680.0, "reward": 0.738525390625, "reward_std": 0.004661278799176216, "rewards//mean": 0.738525390625, "rewards//std": 0.03340674936771393, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2912, "grad_norm": 10.566218376159668, "kl": 1.4250258300453424, "learning_rate": 8.140432653509087e-07, "loss": 0.1425, "num_tokens": 12585336.0, "reward": 0.761474609375, "reward_std": 0.009308917447924614, "rewards//mean": 0.761474609375, "rewards//std": 0.03202228993177414, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2914, "grad_norm": 4.898598670959473, "kl": 1.3507120609283447, "learning_rate": 8.1379627182527e-07, "loss": 0.1351, "num_tokens": 12593952.0, "reward": 0.75079345703125, "reward_std": 0.004424169193953276, "rewards//mean": 0.75079345703125, "rewards//std": 0.03937855735421181, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2916, "grad_norm": 4.961860656738281, "kl": 1.54135200381279, "learning_rate": 8.135491519025306e-07, "loss": 0.1541, "num_tokens": 12602672.0, "reward": 0.7462158203125, "reward_std": 0.009684968739748001, "rewards//mean": 0.7462158203125, "rewards//std": 0.037782661616802216, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2918, "grad_norm": 3.2244813442230225, "kl": 1.0334641635417938, "learning_rate": 8.133019056822302e-07, "loss": 0.1033, "num_tokens": 12611336.0, "reward": 0.7684326171875, "reward_std": 0.00849007535725832, "rewards//mean": 0.7684326171875, "rewards//std": 0.030028430745005608, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.292, "grad_norm": 4.305205821990967, "kl": 1.560680564492941, "learning_rate": 8.130545332639597e-07, "loss": 0.1561, "num_tokens": 12619976.0, "reward": 0.73626708984375, "reward_std": 0.011509668081998825, "rewards//mean": 0.73626708984375, "rewards//std": 0.032196663320064545, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2922, "grad_norm": 5.1138505935668945, "kl": 1.9591032322496176, "learning_rate": 8.128070347473608e-07, "loss": 0.1959, "num_tokens": 12628720.0, "reward": 0.7720947265625, "reward_std": 0.012277388945221901, "rewards//mean": 0.7720947265625, "rewards//std": 0.031205160543322563, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2924, "grad_norm": 2.7395927906036377, "kl": 0.6407976988703012, "learning_rate": 8.125594102321255e-07, "loss": 0.0641, "num_tokens": 12637344.0, "reward": 0.782470703125, "reward_std": 0.0028777476400136948, "rewards//mean": 0.782470703125, "rewards//std": 0.028500672429800034, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2926, "grad_norm": 4.052397727966309, "kl": 0.8177272789180279, "learning_rate": 8.123116598179971e-07, "loss": 0.0818, "num_tokens": 12646096.0, "reward": 0.758544921875, "reward_std": 0.008035607635974884, "rewards//mean": 0.758544921875, "rewards//std": 0.037711478769779205, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2928, "grad_norm": 6.73102331161499, "kl": 1.5023040790110826, "learning_rate": 8.120637836047697e-07, "loss": 0.1502, "num_tokens": 12654704.0, "reward": 0.74102783203125, "reward_std": 0.01163367461413145, "rewards//mean": 0.74102783203125, "rewards//std": 0.03166331350803375, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.293, "grad_norm": 8.07838249206543, "kl": 1.9107473753392696, "learning_rate": 8.118157816922874e-07, "loss": 0.1911, "num_tokens": 12663376.0, "reward": 0.72015380859375, "reward_std": 0.010127536021173, "rewards//mean": 0.72015380859375, "rewards//std": 0.03241688758134842, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2932, "grad_norm": 7.672347545623779, "kl": 1.8996401652693748, "learning_rate": 8.115676541804455e-07, "loss": 0.19, "num_tokens": 12671992.0, "reward": 0.7203369140625, "reward_std": 0.01306104101240635, "rewards//mean": 0.7203369140625, "rewards//std": 0.05011012405157089, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2934, "grad_norm": 4.347347259521484, "kl": 1.485631575807929, "learning_rate": 8.113194011691899e-07, "loss": 0.1486, "num_tokens": 12680584.0, "reward": 0.74786376953125, "reward_std": 0.014387952163815498, "rewards//mean": 0.74786376953125, "rewards//std": 0.03491033613681793, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2936, "grad_norm": 1.2490336894989014, "kl": 0.9858845826238394, "learning_rate": 8.110710227585167e-07, "loss": 0.0986, "num_tokens": 12689168.0, "reward": 0.77215576171875, "reward_std": 0.008455757983028889, "rewards//mean": 0.77215576171875, "rewards//std": 0.023908693343400955, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2938, "grad_norm": 4.55409574508667, "kl": 1.7242893744260073, "learning_rate": 8.108225190484726e-07, "loss": 0.1724, "num_tokens": 12697840.0, "reward": 0.7286376953125, "reward_std": 0.014607956632971764, "rewards//mean": 0.7286376953125, "rewards//std": 0.04412269964814186, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.294, "grad_norm": 2.291954278945923, "kl": 2.1257865503430367, "learning_rate": 8.105738901391551e-07, "loss": 0.2126, "num_tokens": 12706584.0, "reward": 0.771240234375, "reward_std": 0.014790792018175125, "rewards//mean": 0.771240234375, "rewards//std": 0.035096779465675354, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2942, "grad_norm": 7.725774765014648, "kl": 0.7946789208799601, "learning_rate": 8.103251361307118e-07, "loss": 0.0795, "num_tokens": 12715280.0, "reward": 0.74493408203125, "reward_std": 0.0026413900777697563, "rewards//mean": 0.74493408203125, "rewards//std": 0.026727208867669106, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2944, "grad_norm": 1.7929853200912476, "kl": 1.6168434005230665, "learning_rate": 8.100762571233408e-07, "loss": 0.1617, "num_tokens": 12723888.0, "reward": 0.764892578125, "reward_std": 0.011940587311983109, "rewards//mean": 0.764892578125, "rewards//std": 0.030583124607801437, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2946, "grad_norm": 2.7661356925964355, "kl": 1.3101904802024364, "learning_rate": 8.098272532172905e-07, "loss": 0.131, "num_tokens": 12732704.0, "reward": 0.73382568359375, "reward_std": 0.011408751830458641, "rewards//mean": 0.73382568359375, "rewards//std": 0.03307882696390152, "step": 1473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2948, "grad_norm": 3.110581159591675, "kl": 1.0171023327857256, "learning_rate": 8.095781245128597e-07, "loss": 0.1017, "num_tokens": 12741424.0, "reward": 0.7440185546875, "reward_std": 0.007515724282711744, "rewards//mean": 0.7440185546875, "rewards//std": 0.03568596765398979, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.295, "grad_norm": 1.9205505847930908, "kl": 0.822709271684289, "learning_rate": 8.093288711103971e-07, "loss": 0.0823, "num_tokens": 12749984.0, "reward": 0.7606201171875, "reward_std": 0.006251954939216375, "rewards//mean": 0.7606201171875, "rewards//std": 0.027904614806175232, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2952, "grad_norm": 5.270324230194092, "kl": 1.0679902900010347, "learning_rate": 8.090794931103026e-07, "loss": 0.1068, "num_tokens": 12758592.0, "reward": 0.75225830078125, "reward_std": 0.006496988236904144, "rewards//mean": 0.75225830078125, "rewards//std": 0.024898696690797806, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2954, "grad_norm": 4.030917167663574, "kl": 1.2331567518413067, "learning_rate": 8.08829990613025e-07, "loss": 0.1233, "num_tokens": 12767192.0, "reward": 0.7545166015625, "reward_std": 0.013106499798595905, "rewards//mean": 0.7545166015625, "rewards//std": 0.03440921753644943, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2956, "grad_norm": 3.2291336059570312, "kl": 1.6347051300108433, "learning_rate": 8.085803637190643e-07, "loss": 0.1635, "num_tokens": 12775832.0, "reward": 0.786865234375, "reward_std": 0.01683385856449604, "rewards//mean": 0.786865234375, "rewards//std": 0.035172607749700546, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2958, "grad_norm": 5.795406818389893, "kl": 1.2767824828624725, "learning_rate": 8.083306125289697e-07, "loss": 0.1277, "num_tokens": 12784360.0, "reward": 0.755859375, "reward_std": 0.009569985792040825, "rewards//mean": 0.755859375, "rewards//std": 0.03326234221458435, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.296, "grad_norm": 8.213812828063965, "kl": 1.231348818168044, "learning_rate": 8.080807371433414e-07, "loss": 0.1231, "num_tokens": 12792968.0, "reward": 0.75750732421875, "reward_std": 0.010824731551110744, "rewards//mean": 0.75750732421875, "rewards//std": 0.027333160862326622, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2962, "grad_norm": 3.7849719524383545, "kl": 0.7953167650848627, "learning_rate": 8.07830737662829e-07, "loss": 0.0795, "num_tokens": 12801640.0, "reward": 0.7484130859375, "reward_std": 0.004670898430049419, "rewards//mean": 0.7484130859375, "rewards//std": 0.025626515969634056, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2964, "grad_norm": 5.114973068237305, "kl": 1.5109335407614708, "learning_rate": 8.075806141881325e-07, "loss": 0.1511, "num_tokens": 12810376.0, "reward": 0.76104736328125, "reward_std": 0.010529964230954647, "rewards//mean": 0.76104736328125, "rewards//std": 0.03358520567417145, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2966, "grad_norm": 3.346491575241089, "kl": 1.2512095645070076, "learning_rate": 8.073303668200011e-07, "loss": 0.1251, "num_tokens": 12819056.0, "reward": 0.77276611328125, "reward_std": 0.00834638625383377, "rewards//mean": 0.77276611328125, "rewards//std": 0.02868431806564331, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2968, "grad_norm": 11.776288032531738, "kl": 1.6909576747566462, "learning_rate": 8.070799956592349e-07, "loss": 0.1691, "num_tokens": 12827616.0, "reward": 0.73974609375, "reward_std": 0.012325368821620941, "rewards//mean": 0.73974609375, "rewards//std": 0.030887499451637268, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.297, "grad_norm": 3.3797404766082764, "kl": 0.9280413258820772, "learning_rate": 8.06829500806683e-07, "loss": 0.0928, "num_tokens": 12836248.0, "reward": 0.76702880859375, "reward_std": 0.008107181638479233, "rewards//mean": 0.76702880859375, "rewards//std": 0.026063675060868263, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2972, "grad_norm": 6.333701133728027, "kl": 3.41668620146811, "learning_rate": 8.06578882363245e-07, "loss": 0.3417, "num_tokens": 12844976.0, "reward": 0.7333984375, "reward_std": 0.02361653372645378, "rewards//mean": 0.7333984375, "rewards//std": 0.03670991212129593, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2974, "grad_norm": 5.154001235961914, "kl": 1.3662318922579288, "learning_rate": 8.063281404298699e-07, "loss": 0.1366, "num_tokens": 12853648.0, "reward": 0.78631591796875, "reward_std": 0.009496654383838177, "rewards//mean": 0.78631591796875, "rewards//std": 0.030818259343504906, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2976, "grad_norm": 2.8072831630706787, "kl": 1.3740430641919374, "learning_rate": 8.060772751075562e-07, "loss": 0.1374, "num_tokens": 12862296.0, "reward": 0.7535400390625, "reward_std": 0.006750519387423992, "rewards//mean": 0.7535400390625, "rewards//std": 0.022859087213873863, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2978, "grad_norm": 17.512250900268555, "kl": 1.33380495198071, "learning_rate": 8.058262864973528e-07, "loss": 0.1334, "num_tokens": 12870952.0, "reward": 0.74346923828125, "reward_std": 0.008682006038725376, "rewards//mean": 0.74346923828125, "rewards//std": 0.03712359815835953, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.298, "grad_norm": 5.1915717124938965, "kl": 1.159490229561925, "learning_rate": 8.055751747003579e-07, "loss": 0.1159, "num_tokens": 12879552.0, "reward": 0.7369384765625, "reward_std": 0.010565157048404217, "rewards//mean": 0.7369384765625, "rewards//std": 0.032288506627082825, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2982, "grad_norm": 20.961589813232422, "kl": 2.250743241980672, "learning_rate": 8.053239398177191e-07, "loss": 0.2251, "num_tokens": 12888144.0, "reward": 0.75775146484375, "reward_std": 0.01169489324092865, "rewards//mean": 0.75775146484375, "rewards//std": 0.03842165693640709, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2984, "grad_norm": 9.771661758422852, "kl": 1.4084699284285307, "learning_rate": 8.050725819506339e-07, "loss": 0.1408, "num_tokens": 12896776.0, "reward": 0.75518798828125, "reward_std": 0.012158473953604698, "rewards//mean": 0.75518798828125, "rewards//std": 0.029676197096705437, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2986, "grad_norm": 5.195116996765137, "kl": 2.023480501025915, "learning_rate": 8.048211012003489e-07, "loss": 0.2023, "num_tokens": 12905488.0, "reward": 0.74920654296875, "reward_std": 0.014508301392197609, "rewards//mean": 0.74920654296875, "rewards//std": 0.026659158989787102, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2988, "grad_norm": 3.7036025524139404, "kl": 1.2176567781716585, "learning_rate": 8.045694976681612e-07, "loss": 0.1218, "num_tokens": 12914184.0, "reward": 0.73602294921875, "reward_std": 0.010885203257203102, "rewards//mean": 0.73602294921875, "rewards//std": 0.03514456748962402, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.299, "grad_norm": 5.165862560272217, "kl": 2.218764767050743, "learning_rate": 8.043177714554159e-07, "loss": 0.2219, "num_tokens": 12922832.0, "reward": 0.76495361328125, "reward_std": 0.014915489591658115, "rewards//mean": 0.76495361328125, "rewards//std": 0.03475475311279297, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2992, "grad_norm": 5.861644268035889, "kl": 2.293422434478998, "learning_rate": 8.04065922663509e-07, "loss": 0.2293, "num_tokens": 12931536.0, "reward": 0.78033447265625, "reward_std": 0.01676148921251297, "rewards//mean": 0.78033447265625, "rewards//std": 0.04310518130660057, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2994, "grad_norm": 23.607648849487305, "kl": 3.8990252763032913, "learning_rate": 8.038139513938845e-07, "loss": 0.3899, "num_tokens": 12940248.0, "reward": 0.733154296875, "reward_std": 0.011588525958359241, "rewards//mean": 0.733154296875, "rewards//std": 0.03267369419336319, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2996, "grad_norm": 23.689645767211914, "kl": 3.6418681479990482, "learning_rate": 8.035618577480369e-07, "loss": 0.3642, "num_tokens": 12948888.0, "reward": 0.7723388671875, "reward_std": 0.017906051129102707, "rewards//mean": 0.7723388671875, "rewards//std": 0.03809865564107895, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.2998, "grad_norm": 9.937276840209961, "kl": 2.2664911299943924, "learning_rate": 8.033096418275092e-07, "loss": 0.2266, "num_tokens": 12957584.0, "reward": 0.72021484375, "reward_std": 0.011843969114124775, "rewards//mean": 0.72021484375, "rewards//std": 0.046201031655073166, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3, "grad_norm": 7.549541473388672, "kl": 1.1379733439534903, "learning_rate": 8.030573037338941e-07, "loss": 0.1138, "num_tokens": 12966240.0, "reward": 0.77044677734375, "reward_std": 0.01573409140110016, "rewards//mean": 0.77044677734375, "rewards//std": 0.02930404432117939, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3002, "grad_norm": 2.8214757442474365, "kl": 1.3537630531936884, "learning_rate": 8.028048435688333e-07, "loss": 0.1354, "num_tokens": 12974816.0, "reward": 0.77447509765625, "reward_std": 0.00893948134034872, "rewards//mean": 0.77447509765625, "rewards//std": 0.03006690926849842, "step": 1501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3004, "grad_norm": 4.832440376281738, "kl": 1.951517477631569, "learning_rate": 8.025522614340177e-07, "loss": 0.1952, "num_tokens": 12983344.0, "reward": 0.7366943359375, "reward_std": 0.018012333661317825, "rewards//mean": 0.7366943359375, "rewards//std": 0.04109729453921318, "step": 1502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3006, "grad_norm": 6.3301286697387695, "kl": 2.010582856833935, "learning_rate": 8.022995574311875e-07, "loss": 0.2011, "num_tokens": 12992024.0, "reward": 0.7222900390625, "reward_std": 0.010570096783339977, "rewards//mean": 0.7222900390625, "rewards//std": 0.03358638659119606, "step": 1503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3008, "grad_norm": 4.824504852294922, "kl": 1.5263384692370892, "learning_rate": 8.020467316621316e-07, "loss": 0.1526, "num_tokens": 13000600.0, "reward": 0.7816162109375, "reward_std": 0.009848502464592457, "rewards//mean": 0.7816162109375, "rewards//std": 0.02930385060608387, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.301, "grad_norm": 4.907270431518555, "kl": 1.3549659363925457, "learning_rate": 8.017937842286882e-07, "loss": 0.1355, "num_tokens": 13009272.0, "reward": 0.75518798828125, "reward_std": 0.012927262112498283, "rewards//mean": 0.75518798828125, "rewards//std": 0.02927355095744133, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3012, "grad_norm": 2.6145718097686768, "kl": 1.1812763661146164, "learning_rate": 8.015407152327447e-07, "loss": 0.1181, "num_tokens": 13017872.0, "reward": 0.74652099609375, "reward_std": 0.007757539860904217, "rewards//mean": 0.74652099609375, "rewards//std": 0.03364689648151398, "step": 1506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3014, "grad_norm": 3.4602737426757812, "kl": 1.3582522384822369, "learning_rate": 8.012875247762372e-07, "loss": 0.1358, "num_tokens": 13026464.0, "reward": 0.7747802734375, "reward_std": 0.009007740765810013, "rewards//mean": 0.7747802734375, "rewards//std": 0.02747381664812565, "step": 1507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3016, "grad_norm": 2.8289504051208496, "kl": 1.087059661746025, "learning_rate": 8.010342129611507e-07, "loss": 0.1087, "num_tokens": 13035080.0, "reward": 0.74578857421875, "reward_std": 0.0074275191873312, "rewards//mean": 0.74578857421875, "rewards//std": 0.027737779542803764, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3018, "grad_norm": 5.269764423370361, "kl": 1.1959709245711565, "learning_rate": 8.007807798895193e-07, "loss": 0.1196, "num_tokens": 13043824.0, "reward": 0.74530029296875, "reward_std": 0.012596560642123222, "rewards//mean": 0.74530029296875, "rewards//std": 0.03109600394964218, "step": 1509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.302, "grad_norm": 2.1235713958740234, "kl": 1.288216095417738, "learning_rate": 8.005272256634257e-07, "loss": 0.1288, "num_tokens": 13052464.0, "reward": 0.7431640625, "reward_std": 0.0078422911465168, "rewards//mean": 0.7431640625, "rewards//std": 0.03133514150977135, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3022, "grad_norm": 2.6874098777770996, "kl": 1.1431039813905954, "learning_rate": 8.002735503850015e-07, "loss": 0.1143, "num_tokens": 13061144.0, "reward": 0.7708740234375, "reward_std": 0.008286407217383385, "rewards//mean": 0.7708740234375, "rewards//std": 0.029158851131796837, "step": 1511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3024, "grad_norm": 4.7610697746276855, "kl": 1.9959817864000797, "learning_rate": 8.000197541564271e-07, "loss": 0.1996, "num_tokens": 13069808.0, "reward": 0.73052978515625, "reward_std": 0.01354941911995411, "rewards//mean": 0.73052978515625, "rewards//std": 0.038079727441072464, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3026, "grad_norm": 8.037347793579102, "kl": 1.512846115976572, "learning_rate": 7.997658370799316e-07, "loss": 0.1513, "num_tokens": 13078400.0, "reward": 0.761474609375, "reward_std": 0.010340271517634392, "rewards//mean": 0.761474609375, "rewards//std": 0.027751486748456955, "step": 1513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3028, "grad_norm": 4.4248046875, "kl": 1.079721711575985, "learning_rate": 7.995117992577928e-07, "loss": 0.108, "num_tokens": 13086976.0, "reward": 0.7510986328125, "reward_std": 0.008711807429790497, "rewards//mean": 0.7510986328125, "rewards//std": 0.023862190544605255, "step": 1514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.303, "grad_norm": 2.05684232711792, "kl": 0.7509970609098673, "learning_rate": 7.992576407923372e-07, "loss": 0.0751, "num_tokens": 13095616.0, "reward": 0.7593994140625, "reward_std": 0.005750302225351334, "rewards//mean": 0.7593994140625, "rewards//std": 0.029113130643963814, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3032, "grad_norm": 5.213685035705566, "kl": 1.4956692047417164, "learning_rate": 7.990033617859395e-07, "loss": 0.1496, "num_tokens": 13104176.0, "reward": 0.7369384765625, "reward_std": 0.008287956938147545, "rewards//mean": 0.7369384765625, "rewards//std": 0.026238271966576576, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3034, "grad_norm": 3.9885752201080322, "kl": 1.2843257393687963, "learning_rate": 7.987489623410235e-07, "loss": 0.1284, "num_tokens": 13112776.0, "reward": 0.74688720703125, "reward_std": 0.00606179004535079, "rewards//mean": 0.74688720703125, "rewards//std": 0.03626636788249016, "step": 1517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3036, "grad_norm": 4.215937614440918, "kl": 1.8588938284665346, "learning_rate": 7.984944425600613e-07, "loss": 0.1859, "num_tokens": 13121368.0, "reward": 0.76422119140625, "reward_std": 0.016526661813259125, "rewards//mean": 0.76422119140625, "rewards//std": 0.038420867174863815, "step": 1518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3038, "grad_norm": 3.2377138137817383, "kl": 1.2811475973576307, "learning_rate": 7.982398025455732e-07, "loss": 0.1281, "num_tokens": 13130136.0, "reward": 0.7803955078125, "reward_std": 0.00979221984744072, "rewards//mean": 0.7803955078125, "rewards//std": 0.02574673853814602, "step": 1519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.304, "grad_norm": 4.779201507568359, "kl": 1.1627158615738153, "learning_rate": 7.979850424001282e-07, "loss": 0.1163, "num_tokens": 13138768.0, "reward": 0.76025390625, "reward_std": 0.008882921189069748, "rewards//mean": 0.76025390625, "rewards//std": 0.026607505977153778, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3042, "grad_norm": 1.3904224634170532, "kl": 0.8005918823182583, "learning_rate": 7.97730162226344e-07, "loss": 0.0801, "num_tokens": 13147480.0, "reward": 0.74847412109375, "reward_std": 0.004718668758869171, "rewards//mean": 0.74847412109375, "rewards//std": 0.029835429042577744, "step": 1521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3044, "grad_norm": 5.867562770843506, "kl": 1.6294219363480806, "learning_rate": 7.974751621268858e-07, "loss": 0.1629, "num_tokens": 13156176.0, "reward": 0.7520751953125, "reward_std": 0.009769590571522713, "rewards//mean": 0.7520751953125, "rewards//std": 0.03724679350852966, "step": 1522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3046, "grad_norm": 18.40738868713379, "kl": 1.7692720592021942, "learning_rate": 7.972200422044682e-07, "loss": 0.1769, "num_tokens": 13164800.0, "reward": 0.74969482421875, "reward_std": 0.016118552535772324, "rewards//mean": 0.74969482421875, "rewards//std": 0.03148400038480759, "step": 1523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3048, "grad_norm": 13.267224311828613, "kl": 0.8517594467848539, "learning_rate": 7.969648025618529e-07, "loss": 0.0852, "num_tokens": 13173432.0, "reward": 0.7694091796875, "reward_std": 0.0037979367189109325, "rewards//mean": 0.7694091796875, "rewards//std": 0.028133686631917953, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.305, "grad_norm": 11.241990089416504, "kl": 1.5441111326217651, "learning_rate": 7.967094433018508e-07, "loss": 0.1544, "num_tokens": 13182008.0, "reward": 0.760498046875, "reward_std": 0.012233583256602287, "rewards//mean": 0.760498046875, "rewards//std": 0.023917002603411674, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3052, "grad_norm": 4.886778831481934, "kl": 0.9253472704440355, "learning_rate": 7.964539645273202e-07, "loss": 0.0925, "num_tokens": 13190560.0, "reward": 0.776123046875, "reward_std": 0.008092978969216347, "rewards//mean": 0.776123046875, "rewards//std": 0.0288217943161726, "step": 1526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3054, "grad_norm": 6.10654878616333, "kl": 1.8288121819496155, "learning_rate": 7.961983663411684e-07, "loss": 0.1829, "num_tokens": 13199200.0, "reward": 0.75701904296875, "reward_std": 0.011038584634661674, "rewards//mean": 0.75701904296875, "rewards//std": 0.03996465727686882, "step": 1527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3056, "grad_norm": 21.48021125793457, "kl": 2.005971573293209, "learning_rate": 7.959426488463499e-07, "loss": 0.2006, "num_tokens": 13207840.0, "reward": 0.739990234375, "reward_std": 0.012318646535277367, "rewards//mean": 0.739990234375, "rewards//std": 0.028055289760231972, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3058, "grad_norm": 14.893638610839844, "kl": 2.1409136690199375, "learning_rate": 7.956868121458677e-07, "loss": 0.2141, "num_tokens": 13216408.0, "reward": 0.73272705078125, "reward_std": 0.00900019146502018, "rewards//mean": 0.73272705078125, "rewards//std": 0.0260398518294096, "step": 1529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.306, "grad_norm": 15.753615379333496, "kl": 2.612980095669627, "learning_rate": 7.954308563427732e-07, "loss": 0.2613, "num_tokens": 13225024.0, "reward": 0.74664306640625, "reward_std": 0.014729749411344528, "rewards//mean": 0.74664306640625, "rewards//std": 0.03410980850458145, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3062, "grad_norm": 6.290655612945557, "kl": 1.0625749547034502, "learning_rate": 7.951747815401649e-07, "loss": 0.1063, "num_tokens": 13233720.0, "reward": 0.7789306640625, "reward_std": 0.005719395820051432, "rewards//mean": 0.7789306640625, "rewards//std": 0.03233722969889641, "step": 1531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3064, "grad_norm": 3.4960575103759766, "kl": 0.9276208523660898, "learning_rate": 7.949185878411899e-07, "loss": 0.0928, "num_tokens": 13242352.0, "reward": 0.78204345703125, "reward_std": 0.010589463636279106, "rewards//mean": 0.78204345703125, "rewards//std": 0.026722678914666176, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3066, "grad_norm": 4.718594551086426, "kl": 0.8793540969491005, "learning_rate": 7.946622753490432e-07, "loss": 0.0879, "num_tokens": 13250976.0, "reward": 0.7777099609375, "reward_std": 0.009698061272501945, "rewards//mean": 0.7777099609375, "rewards//std": 0.029811890795826912, "step": 1533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3068, "grad_norm": 3.5431320667266846, "kl": 1.7327657788991928, "learning_rate": 7.94405844166967e-07, "loss": 0.1733, "num_tokens": 13259616.0, "reward": 0.7159423828125, "reward_std": 0.012289916165173054, "rewards//mean": 0.7159423828125, "rewards//std": 0.04350076988339424, "step": 1534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.307, "grad_norm": 8.551236152648926, "kl": 2.487602587789297, "learning_rate": 7.941492943982521e-07, "loss": 0.2488, "num_tokens": 13268344.0, "reward": 0.7349853515625, "reward_std": 0.0126652205362916, "rewards//mean": 0.7349853515625, "rewards//std": 0.044675033539533615, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3072, "grad_norm": 4.800576686859131, "kl": 1.3567574676126242, "learning_rate": 7.938926261462365e-07, "loss": 0.1357, "num_tokens": 13276984.0, "reward": 0.73065185546875, "reward_std": 0.009367650374770164, "rewards//mean": 0.73065185546875, "rewards//std": 0.027281051501631737, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3074, "grad_norm": 2.5340986251831055, "kl": 1.4362577367573977, "learning_rate": 7.936358395143063e-07, "loss": 0.1436, "num_tokens": 13285624.0, "reward": 0.7435302734375, "reward_std": 0.007179467007517815, "rewards//mean": 0.7435302734375, "rewards//std": 0.024351980537176132, "step": 1537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3076, "grad_norm": 4.1041789054870605, "kl": 1.6592697482556105, "learning_rate": 7.93378934605895e-07, "loss": 0.1659, "num_tokens": 13294312.0, "reward": 0.75531005859375, "reward_std": 0.014915602281689644, "rewards//mean": 0.75531005859375, "rewards//std": 0.030870771035552025, "step": 1538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3078, "grad_norm": 2.6387155055999756, "kl": 0.9437056761234999, "learning_rate": 7.93121911524484e-07, "loss": 0.0944, "num_tokens": 13302888.0, "reward": 0.7305908203125, "reward_std": 0.005320197436958551, "rewards//mean": 0.7305908203125, "rewards//std": 0.031148837879300117, "step": 1539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.308, "grad_norm": 2.047175645828247, "kl": 0.7891005948185921, "learning_rate": 7.928647703736023e-07, "loss": 0.0789, "num_tokens": 13311616.0, "reward": 0.7376708984375, "reward_std": 0.0020976788364350796, "rewards//mean": 0.7376708984375, "rewards//std": 0.029250076040625572, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3082, "grad_norm": 6.142187118530273, "kl": 0.8477199338376522, "learning_rate": 7.926075112568258e-07, "loss": 0.0848, "num_tokens": 13320200.0, "reward": 0.759033203125, "reward_std": 0.012320725247263908, "rewards//mean": 0.759033203125, "rewards//std": 0.034074340015649796, "step": 1541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3084, "grad_norm": 3.6553497314453125, "kl": 0.8029363844543695, "learning_rate": 7.923501342777787e-07, "loss": 0.0803, "num_tokens": 13328760.0, "reward": 0.73828125, "reward_std": 0.007218477316200733, "rewards//mean": 0.73828125, "rewards//std": 0.03479268029332161, "step": 1542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3086, "grad_norm": 6.512855529785156, "kl": 0.8737721461802721, "learning_rate": 7.920926395401326e-07, "loss": 0.0874, "num_tokens": 13337336.0, "reward": 0.780029296875, "reward_std": 0.007003386504948139, "rewards//mean": 0.780029296875, "rewards//std": 0.02661092020571232, "step": 1543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3088, "grad_norm": 4.729723930358887, "kl": 2.118842177093029, "learning_rate": 7.918350271476063e-07, "loss": 0.2119, "num_tokens": 13346008.0, "reward": 0.7794189453125, "reward_std": 0.010929237119853497, "rewards//mean": 0.7794189453125, "rewards//std": 0.030016331002116203, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.309, "grad_norm": 2.64178466796875, "kl": 1.4418923668563366, "learning_rate": 7.915772972039659e-07, "loss": 0.1442, "num_tokens": 13354600.0, "reward": 0.75494384765625, "reward_std": 0.006001880392432213, "rewards//mean": 0.75494384765625, "rewards//std": 0.03073612041771412, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3092, "grad_norm": 5.51864767074585, "kl": 1.497476452961564, "learning_rate": 7.913194498130251e-07, "loss": 0.1497, "num_tokens": 13363288.0, "reward": 0.774658203125, "reward_std": 0.010132655501365662, "rewards//mean": 0.774658203125, "rewards//std": 0.035027701407670975, "step": 1546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3094, "grad_norm": 3.6369011402130127, "kl": 1.2751891389489174, "learning_rate": 7.910614850786447e-07, "loss": 0.1275, "num_tokens": 13371904.0, "reward": 0.748046875, "reward_std": 0.009027308784425259, "rewards//mean": 0.748046875, "rewards//std": 0.02498798444867134, "step": 1547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3096, "grad_norm": 1.8712557554244995, "kl": 1.1147124227136374, "learning_rate": 7.90803403104733e-07, "loss": 0.1115, "num_tokens": 13380552.0, "reward": 0.76202392578125, "reward_std": 0.0050063710659742355, "rewards//mean": 0.76202392578125, "rewards//std": 0.02744811400771141, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3098, "grad_norm": 3.3328089714050293, "kl": 1.0839637350291014, "learning_rate": 7.905452039952451e-07, "loss": 0.1084, "num_tokens": 13389160.0, "reward": 0.8018798828125, "reward_std": 0.011048915795981884, "rewards//mean": 0.8018798828125, "rewards//std": 0.02959785796701908, "step": 1549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.31, "grad_norm": 11.956218719482422, "kl": 2.716661686077714, "learning_rate": 7.90286887854184e-07, "loss": 0.2717, "num_tokens": 13397784.0, "reward": 0.737060546875, "reward_std": 0.010013382881879807, "rewards//mean": 0.737060546875, "rewards//std": 0.03005589358508587, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3102, "grad_norm": 2.4767353534698486, "kl": 1.0682494826614857, "learning_rate": 7.900284547855991e-07, "loss": 0.1068, "num_tokens": 13406464.0, "reward": 0.7271728515625, "reward_std": 0.008038777858018875, "rewards//mean": 0.7271728515625, "rewards//std": 0.036112699657678604, "step": 1551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3104, "grad_norm": 3.2068850994110107, "kl": 1.5400317627936602, "learning_rate": 7.897699048935873e-07, "loss": 0.154, "num_tokens": 13415112.0, "reward": 0.753662109375, "reward_std": 0.01680159568786621, "rewards//mean": 0.753662109375, "rewards//std": 0.035962529480457306, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3106, "grad_norm": 3.355475425720215, "kl": 1.2242168709635735, "learning_rate": 7.895112382822924e-07, "loss": 0.1224, "num_tokens": 13423688.0, "reward": 0.77093505859375, "reward_std": 0.01482023298740387, "rewards//mean": 0.77093505859375, "rewards//std": 0.03478827700018883, "step": 1553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3108, "grad_norm": 2.225855827331543, "kl": 1.0106054935604334, "learning_rate": 7.892524550559055e-07, "loss": 0.1011, "num_tokens": 13432320.0, "reward": 0.75189208984375, "reward_std": 0.008050731383264065, "rewards//mean": 0.75189208984375, "rewards//std": 0.022638414055109024, "step": 1554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.311, "grad_norm": 3.592600107192993, "kl": 0.8617877624928951, "learning_rate": 7.889935553186641e-07, "loss": 0.0862, "num_tokens": 13440992.0, "reward": 0.722412109375, "reward_std": 0.0037979367189109325, "rewards//mean": 0.722412109375, "rewards//std": 0.03946883976459503, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3112, "grad_norm": 6.422780990600586, "kl": 2.3515776824206114, "learning_rate": 7.887345391748532e-07, "loss": 0.2352, "num_tokens": 13449704.0, "reward": 0.7469482421875, "reward_std": 0.014347494579851627, "rewards//mean": 0.7469482421875, "rewards//std": 0.0360405296087265, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3114, "grad_norm": 2.1762797832489014, "kl": 1.141447415575385, "learning_rate": 7.884754067288046e-07, "loss": 0.1141, "num_tokens": 13458200.0, "reward": 0.74432373046875, "reward_std": 0.007982091046869755, "rewards//mean": 0.74432373046875, "rewards//std": 0.02768205665051937, "step": 1557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3116, "grad_norm": 1.4633209705352783, "kl": 1.4486240819096565, "learning_rate": 7.882161580848966e-07, "loss": 0.1449, "num_tokens": 13466832.0, "reward": 0.77484130859375, "reward_std": 0.009828522801399231, "rewards//mean": 0.77484130859375, "rewards//std": 0.03169150650501251, "step": 1558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3118, "grad_norm": 7.229136943817139, "kl": 1.8612057957798243, "learning_rate": 7.879567933475546e-07, "loss": 0.1861, "num_tokens": 13475464.0, "reward": 0.78656005859375, "reward_std": 0.0042387619614601135, "rewards//mean": 0.78656005859375, "rewards//std": 0.026533372700214386, "step": 1559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.312, "grad_norm": 1.9912046194076538, "kl": 1.2802764270454645, "learning_rate": 7.876973126212506e-07, "loss": 0.128, "num_tokens": 13484048.0, "reward": 0.75384521484375, "reward_std": 0.008768283762037754, "rewards//mean": 0.75384521484375, "rewards//std": 0.028709111735224724, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3122, "grad_norm": 2.2037763595581055, "kl": 2.05273468978703, "learning_rate": 7.874377160105036e-07, "loss": 0.2053, "num_tokens": 13492744.0, "reward": 0.74334716796875, "reward_std": 0.014118552207946777, "rewards//mean": 0.74334716796875, "rewards//std": 0.035445649176836014, "step": 1561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3124, "grad_norm": 2.6772477626800537, "kl": 1.9671489968895912, "learning_rate": 7.871780036198788e-07, "loss": 0.1967, "num_tokens": 13501400.0, "reward": 0.71099853515625, "reward_std": 0.009235531091690063, "rewards//mean": 0.71099853515625, "rewards//std": 0.03892269358038902, "step": 1562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3126, "grad_norm": 1.6730657815933228, "kl": 0.8839522656053305, "learning_rate": 7.869181755539887e-07, "loss": 0.0884, "num_tokens": 13509960.0, "reward": 0.776611328125, "reward_std": 0.004584170877933502, "rewards//mean": 0.776611328125, "rewards//std": 0.021775512024760246, "step": 1563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3128, "grad_norm": 1.6954586505889893, "kl": 1.7431340981274843, "learning_rate": 7.866582319174917e-07, "loss": 0.1743, "num_tokens": 13518672.0, "reward": 0.75811767578125, "reward_std": 0.01100950874388218, "rewards//mean": 0.75811767578125, "rewards//std": 0.03252829983830452, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.313, "grad_norm": 5.556436538696289, "kl": 2.1147956494241953, "learning_rate": 7.863981728150931e-07, "loss": 0.2115, "num_tokens": 13527360.0, "reward": 0.75927734375, "reward_std": 0.01640233024954796, "rewards//mean": 0.75927734375, "rewards//std": 0.04094248265028, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3132, "grad_norm": 5.08538293838501, "kl": 2.2543717678636312, "learning_rate": 7.861379983515448e-07, "loss": 0.2254, "num_tokens": 13536168.0, "reward": 0.74151611328125, "reward_std": 0.011906067840754986, "rewards//mean": 0.74151611328125, "rewards//std": 0.026617106050252914, "step": 1566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3134, "grad_norm": 4.156130790710449, "kl": 1.999925158917904, "learning_rate": 7.858777086316451e-07, "loss": 0.2, "num_tokens": 13544952.0, "reward": 0.7574462890625, "reward_std": 0.01224803738296032, "rewards//mean": 0.7574462890625, "rewards//std": 0.02960808575153351, "step": 1567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3136, "grad_norm": 1.113006591796875, "kl": 1.2617440987378359, "learning_rate": 7.856173037602382e-07, "loss": 0.1262, "num_tokens": 13553632.0, "reward": 0.7415771484375, "reward_std": 0.006727217696607113, "rewards//mean": 0.7415771484375, "rewards//std": 0.03345995023846626, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3138, "grad_norm": 2.3450334072113037, "kl": 1.4017070643603802, "learning_rate": 7.853567838422159e-07, "loss": 0.1402, "num_tokens": 13562288.0, "reward": 0.7579345703125, "reward_std": 0.006472185719758272, "rewards//mean": 0.7579345703125, "rewards//std": 0.025807810947299004, "step": 1569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.314, "grad_norm": 3.7492599487304688, "kl": 1.2129064369946718, "learning_rate": 7.850961489825149e-07, "loss": 0.1213, "num_tokens": 13570992.0, "reward": 0.71734619140625, "reward_std": 0.005895301233977079, "rewards//mean": 0.71734619140625, "rewards//std": 0.03116748295724392, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3142, "grad_norm": 3.8201379776000977, "kl": 1.356517480686307, "learning_rate": 7.848353992861194e-07, "loss": 0.1357, "num_tokens": 13579696.0, "reward": 0.7706298828125, "reward_std": 0.007019908633083105, "rewards//mean": 0.7706298828125, "rewards//std": 0.024104561656713486, "step": 1571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3144, "grad_norm": 6.372725963592529, "kl": 1.8210402820259333, "learning_rate": 7.84574534858059e-07, "loss": 0.1821, "num_tokens": 13588248.0, "reward": 0.78668212890625, "reward_std": 0.011142443865537643, "rewards//mean": 0.78668212890625, "rewards//std": 0.028173675760626793, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3146, "grad_norm": 2.6758177280426025, "kl": 1.2292321268469095, "learning_rate": 7.8431355580341e-07, "loss": 0.1229, "num_tokens": 13596944.0, "reward": 0.780517578125, "reward_std": 0.00958950724452734, "rewards//mean": 0.780517578125, "rewards//std": 0.02704427018761635, "step": 1573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3148, "grad_norm": 2.7407374382019043, "kl": 1.4006772879511118, "learning_rate": 7.840524622272948e-07, "loss": 0.1401, "num_tokens": 13605600.0, "reward": 0.7579345703125, "reward_std": 0.010624115355312824, "rewards//mean": 0.7579345703125, "rewards//std": 0.02942139469087124, "step": 1574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.315, "grad_norm": 1.3578262329101562, "kl": 1.2945702727884054, "learning_rate": 7.837912542348817e-07, "loss": 0.1295, "num_tokens": 13614224.0, "reward": 0.787841796875, "reward_std": 0.009543722495436668, "rewards//mean": 0.787841796875, "rewards//std": 0.02747078612446785, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3152, "grad_norm": 1.3978792428970337, "kl": 0.59761449880898, "learning_rate": 7.835299319313853e-07, "loss": 0.0598, "num_tokens": 13622760.0, "reward": 0.78985595703125, "reward_std": 0.0037059392780065536, "rewards//mean": 0.78985595703125, "rewards//std": 0.020566320046782494, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3154, "grad_norm": 4.199551582336426, "kl": 1.2743331119418144, "learning_rate": 7.832684954220663e-07, "loss": 0.1274, "num_tokens": 13631560.0, "reward": 0.7708740234375, "reward_std": 0.005090552382171154, "rewards//mean": 0.7708740234375, "rewards//std": 0.0252983458340168, "step": 1577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3156, "grad_norm": 1.912658929824829, "kl": 0.8259076457470655, "learning_rate": 7.830069448122312e-07, "loss": 0.0826, "num_tokens": 13640152.0, "reward": 0.77105712890625, "reward_std": 0.005859190598130226, "rewards//mean": 0.77105712890625, "rewards//std": 0.026771916076540947, "step": 1578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3158, "grad_norm": 4.865224838256836, "kl": 1.5082801021635532, "learning_rate": 7.827452802072327e-07, "loss": 0.1508, "num_tokens": 13648872.0, "reward": 0.7486572265625, "reward_std": 0.01078061480075121, "rewards//mean": 0.7486572265625, "rewards//std": 0.031041739508509636, "step": 1579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.316, "grad_norm": 2.9601898193359375, "kl": 1.4153399653732777, "learning_rate": 7.82483501712469e-07, "loss": 0.1415, "num_tokens": 13657488.0, "reward": 0.7210693359375, "reward_std": 0.007203842978924513, "rewards//mean": 0.7210693359375, "rewards//std": 0.03267994895577431, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3162, "grad_norm": 2.042904853820801, "kl": 1.1619634237140417, "learning_rate": 7.822216094333847e-07, "loss": 0.1162, "num_tokens": 13666072.0, "reward": 0.78118896484375, "reward_std": 0.005377811845391989, "rewards//mean": 0.78118896484375, "rewards//std": 0.024463849142193794, "step": 1581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3164, "grad_norm": 3.8128445148468018, "kl": 2.1430839393287897, "learning_rate": 7.819596034754696e-07, "loss": 0.2143, "num_tokens": 13674672.0, "reward": 0.772216796875, "reward_std": 0.020793776959180832, "rewards//mean": 0.772216796875, "rewards//std": 0.03858760744333267, "step": 1582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3166, "grad_norm": 4.484536170959473, "kl": 1.7213558480143547, "learning_rate": 7.816974839442603e-07, "loss": 0.1721, "num_tokens": 13683352.0, "reward": 0.75592041015625, "reward_std": 0.009616399183869362, "rewards//mean": 0.75592041015625, "rewards//std": 0.035207830369472504, "step": 1583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3168, "grad_norm": 3.307380437850952, "kl": 0.9095842018723488, "learning_rate": 7.814352509453379e-07, "loss": 0.091, "num_tokens": 13691944.0, "reward": 0.76568603515625, "reward_std": 0.006881332024931908, "rewards//mean": 0.76568603515625, "rewards//std": 0.03095010668039322, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.317, "grad_norm": 0.8180329203605652, "kl": 0.8523822519928217, "learning_rate": 7.811729045843301e-07, "loss": 0.0852, "num_tokens": 13700544.0, "reward": 0.752197265625, "reward_std": 0.005305876489728689, "rewards//mean": 0.752197265625, "rewards//std": 0.03477788344025612, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3172, "grad_norm": 1.9212833642959595, "kl": 0.7617413811385632, "learning_rate": 7.8091044496691e-07, "loss": 0.0762, "num_tokens": 13709176.0, "reward": 0.74871826171875, "reward_std": 0.004841688089072704, "rewards//mean": 0.74871826171875, "rewards//std": 0.03266019746661186, "step": 1586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3174, "grad_norm": 13.169044494628906, "kl": 2.1142956260591745, "learning_rate": 7.806478721987963e-07, "loss": 0.2114, "num_tokens": 13717928.0, "reward": 0.75714111328125, "reward_std": 0.007307147607207298, "rewards//mean": 0.75714111328125, "rewards//std": 0.03143203258514404, "step": 1587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3176, "grad_norm": 2.6408851146698, "kl": 1.6814204212278128, "learning_rate": 7.803851863857532e-07, "loss": 0.1681, "num_tokens": 13726672.0, "reward": 0.7545166015625, "reward_std": 0.014688249677419662, "rewards//mean": 0.7545166015625, "rewards//std": 0.03536039963364601, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3178, "grad_norm": 4.11833381652832, "kl": 0.7404515333473682, "learning_rate": 7.801223876335907e-07, "loss": 0.074, "num_tokens": 13735360.0, "reward": 0.75146484375, "reward_std": 0.004385187290608883, "rewards//mean": 0.75146484375, "rewards//std": 0.022146357223391533, "step": 1589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.318, "grad_norm": 3.3639450073242188, "kl": 1.2758905328810215, "learning_rate": 7.798594760481637e-07, "loss": 0.1276, "num_tokens": 13743944.0, "reward": 0.744384765625, "reward_std": 0.01257241703569889, "rewards//mean": 0.744384765625, "rewards//std": 0.03544698655605316, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3182, "grad_norm": 2.480842351913452, "kl": 0.7207079511135817, "learning_rate": 7.795964517353733e-07, "loss": 0.0721, "num_tokens": 13752664.0, "reward": 0.75469970703125, "reward_std": 0.005162329412996769, "rewards//mean": 0.75469970703125, "rewards//std": 0.02229546196758747, "step": 1591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3184, "grad_norm": 2.6617345809936523, "kl": 0.6747199799865484, "learning_rate": 7.793333148011657e-07, "loss": 0.0675, "num_tokens": 13761248.0, "reward": 0.75189208984375, "reward_std": 0.003001078264787793, "rewards//mean": 0.75189208984375, "rewards//std": 0.03142769634723663, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3186, "grad_norm": 3.590163230895996, "kl": 1.6595496125519276, "learning_rate": 7.790700653515323e-07, "loss": 0.166, "num_tokens": 13769792.0, "reward": 0.70941162109375, "reward_std": 0.018709056079387665, "rewards//mean": 0.70941162109375, "rewards//std": 0.047458428889513016, "step": 1593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3188, "grad_norm": 1.8195044994354248, "kl": 0.7813002169132233, "learning_rate": 7.788067034925099e-07, "loss": 0.0781, "num_tokens": 13778368.0, "reward": 0.76263427734375, "reward_std": 0.004283786751329899, "rewards//mean": 0.76263427734375, "rewards//std": 0.024956386536359787, "step": 1594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.319, "grad_norm": 1.3556106090545654, "kl": 0.5726229008287191, "learning_rate": 7.785432293301806e-07, "loss": 0.0573, "num_tokens": 13786984.0, "reward": 0.7423095703125, "reward_std": 0.0024699352215975523, "rewards//mean": 0.7423095703125, "rewards//std": 0.029601948335766792, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3192, "grad_norm": 2.3305325508117676, "kl": 1.0940890200436115, "learning_rate": 7.78279642970672e-07, "loss": 0.1094, "num_tokens": 13795544.0, "reward": 0.766357421875, "reward_std": 0.007712875958532095, "rewards//mean": 0.766357421875, "rewards//std": 0.028321649879217148, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3194, "grad_norm": 2.3872010707855225, "kl": 1.9481038190424442, "learning_rate": 7.780159445201562e-07, "loss": 0.1948, "num_tokens": 13804232.0, "reward": 0.7818603515625, "reward_std": 0.013373659923672676, "rewards//mean": 0.7818603515625, "rewards//std": 0.029406985267996788, "step": 1597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3196, "grad_norm": 5.151343822479248, "kl": 1.7479426227509975, "learning_rate": 7.777521340848514e-07, "loss": 0.1748, "num_tokens": 13812880.0, "reward": 0.70086669921875, "reward_std": 0.010062005370855331, "rewards//mean": 0.70086669921875, "rewards//std": 0.03129495680332184, "step": 1598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3198, "grad_norm": 5.408864498138428, "kl": 1.326032130047679, "learning_rate": 7.774882117710202e-07, "loss": 0.1326, "num_tokens": 13821464.0, "reward": 0.74822998046875, "reward_std": 0.00762249156832695, "rewards//mean": 0.74822998046875, "rewards//std": 0.029031576588749886, "step": 1599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.32, "grad_norm": 8.4829740524292, "kl": 1.9540950190275908, "learning_rate": 7.772241776849704e-07, "loss": 0.1954, "num_tokens": 13830008.0, "reward": 0.69781494140625, "reward_std": 0.0062002502381801605, "rewards//mean": 0.69781494140625, "rewards//std": 0.01987382210791111, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3202, "grad_norm": 4.335939884185791, "kl": 1.7251414489001036, "learning_rate": 7.769600319330552e-07, "loss": 0.1725, "num_tokens": 13838680.0, "reward": 0.772216796875, "reward_std": 0.013628230430185795, "rewards//mean": 0.772216796875, "rewards//std": 0.034840505570173264, "step": 1601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3204, "grad_norm": 3.052686929702759, "kl": 1.5815194714814425, "learning_rate": 7.76695774621672e-07, "loss": 0.1582, "num_tokens": 13847352.0, "reward": 0.751220703125, "reward_std": 0.00982726737856865, "rewards//mean": 0.751220703125, "rewards//std": 0.02717827446758747, "step": 1602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3206, "grad_norm": 5.098186492919922, "kl": 1.0441518649458885, "learning_rate": 7.764314058572639e-07, "loss": 0.1044, "num_tokens": 13855992.0, "reward": 0.75897216796875, "reward_std": 0.006328641436994076, "rewards//mean": 0.75897216796875, "rewards//std": 0.022549305111169815, "step": 1603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3208, "grad_norm": 2.5886666774749756, "kl": 1.1139055788516998, "learning_rate": 7.761669257463187e-07, "loss": 0.1114, "num_tokens": 13864784.0, "reward": 0.77484130859375, "reward_std": 0.007363666780292988, "rewards//mean": 0.77484130859375, "rewards//std": 0.016810311004519463, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.321, "grad_norm": 2.864546298980713, "kl": 1.4520734641700983, "learning_rate": 7.759023343953688e-07, "loss": 0.1452, "num_tokens": 13873328.0, "reward": 0.7550048828125, "reward_std": 0.00650212075561285, "rewards//mean": 0.7550048828125, "rewards//std": 0.02677057310938835, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3212, "grad_norm": 5.175642490386963, "kl": 1.1859997715801, "learning_rate": 7.756376319109916e-07, "loss": 0.1186, "num_tokens": 13882000.0, "reward": 0.744384765625, "reward_std": 0.009795076213777065, "rewards//mean": 0.744384765625, "rewards//std": 0.02811565436422825, "step": 1606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3214, "grad_norm": 2.274979591369629, "kl": 1.1782709881663322, "learning_rate": 7.753728183998092e-07, "loss": 0.1178, "num_tokens": 13890640.0, "reward": 0.752685546875, "reward_std": 0.009832650423049927, "rewards//mean": 0.752685546875, "rewards//std": 0.03325415030121803, "step": 1607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3216, "grad_norm": 6.960323810577393, "kl": 2.113839268684387, "learning_rate": 7.751078939684885e-07, "loss": 0.2114, "num_tokens": 13899208.0, "reward": 0.75244140625, "reward_std": 0.01238178089261055, "rewards//mean": 0.75244140625, "rewards//std": 0.03176885098218918, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3218, "grad_norm": 3.728010892868042, "kl": 0.8760987985879183, "learning_rate": 7.748428587237411e-07, "loss": 0.0876, "num_tokens": 13907808.0, "reward": 0.75177001953125, "reward_std": 0.00822415016591549, "rewards//mean": 0.75177001953125, "rewards//std": 0.02808164805173874, "step": 1609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.322, "grad_norm": 3.4108834266662598, "kl": 1.701963922008872, "learning_rate": 7.74577712772323e-07, "loss": 0.1702, "num_tokens": 13916416.0, "reward": 0.75103759765625, "reward_std": 0.00944104976952076, "rewards//mean": 0.75103759765625, "rewards//std": 0.017023282125592232, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3222, "grad_norm": 1.2875810861587524, "kl": 1.2748398445546627, "learning_rate": 7.743124562210351e-07, "loss": 0.1275, "num_tokens": 13925032.0, "reward": 0.71661376953125, "reward_std": 0.0060372017323970795, "rewards//mean": 0.71661376953125, "rewards//std": 0.025228464975953102, "step": 1611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3224, "grad_norm": 5.368699550628662, "kl": 1.4100149907171726, "learning_rate": 7.740470891767224e-07, "loss": 0.141, "num_tokens": 13933616.0, "reward": 0.74786376953125, "reward_std": 0.00679908599704504, "rewards//mean": 0.74786376953125, "rewards//std": 0.02963433973491192, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3226, "grad_norm": 4.926486492156982, "kl": 1.964484740048647, "learning_rate": 7.737816117462751e-07, "loss": 0.1964, "num_tokens": 13942152.0, "reward": 0.74755859375, "reward_std": 0.006185316480696201, "rewards//mean": 0.74755859375, "rewards//std": 0.028726045042276382, "step": 1613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3228, "grad_norm": 2.5669498443603516, "kl": 1.7307023089379072, "learning_rate": 7.735160240366274e-07, "loss": 0.1731, "num_tokens": 13950896.0, "reward": 0.7427978515625, "reward_std": 0.009273719973862171, "rewards//mean": 0.7427978515625, "rewards//std": 0.03390577435493469, "step": 1614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.323, "grad_norm": 6.909610748291016, "kl": 2.189407590776682, "learning_rate": 7.732503261547578e-07, "loss": 0.2189, "num_tokens": 13959576.0, "reward": 0.783935546875, "reward_std": 0.011407473124563694, "rewards//mean": 0.783935546875, "rewards//std": 0.031062457710504532, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3232, "grad_norm": 2.96000337600708, "kl": 2.104118162766099, "learning_rate": 7.729845182076895e-07, "loss": 0.2104, "num_tokens": 13968200.0, "reward": 0.78009033203125, "reward_std": 0.01756000518798828, "rewards//mean": 0.78009033203125, "rewards//std": 0.03320170193910599, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3234, "grad_norm": 10.360834121704102, "kl": 1.9920466039329767, "learning_rate": 7.7271860030249e-07, "loss": 0.1992, "num_tokens": 13976824.0, "reward": 0.7193603515625, "reward_std": 0.011293711140751839, "rewards//mean": 0.7193603515625, "rewards//std": 0.03919384628534317, "step": 1617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3236, "grad_norm": 8.590371131896973, "kl": 2.300953108817339, "learning_rate": 7.72452572546271e-07, "loss": 0.2301, "num_tokens": 13985536.0, "reward": 0.756103515625, "reward_std": 0.010012295097112656, "rewards//mean": 0.756103515625, "rewards//std": 0.037986643612384796, "step": 1618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3238, "grad_norm": 10.4971923828125, "kl": 3.008170459419489, "learning_rate": 7.721864350461882e-07, "loss": 0.3008, "num_tokens": 13994200.0, "reward": 0.77392578125, "reward_std": 0.01783689856529236, "rewards//mean": 0.77392578125, "rewards//std": 0.04572148621082306, "step": 1619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.324, "grad_norm": 5.43795919418335, "kl": 1.2357711791992188, "learning_rate": 7.71920187909442e-07, "loss": 0.1236, "num_tokens": 14002800.0, "reward": 0.76153564453125, "reward_std": 0.010449150577187538, "rewards//mean": 0.76153564453125, "rewards//std": 0.03423428535461426, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3242, "grad_norm": 4.845322132110596, "kl": 1.9339684657752514, "learning_rate": 7.716538312432765e-07, "loss": 0.1934, "num_tokens": 14011320.0, "reward": 0.735595703125, "reward_std": 0.014104663394391537, "rewards//mean": 0.735595703125, "rewards//std": 0.03157288581132889, "step": 1621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3244, "grad_norm": 3.489070415496826, "kl": 1.914293834939599, "learning_rate": 7.713873651549804e-07, "loss": 0.1914, "num_tokens": 14019920.0, "reward": 0.746337890625, "reward_std": 0.013487953692674637, "rewards//mean": 0.746337890625, "rewards//std": 0.03299090638756752, "step": 1622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3246, "grad_norm": 4.1307878494262695, "kl": 1.3855783771723509, "learning_rate": 7.71120789751886e-07, "loss": 0.1386, "num_tokens": 14028488.0, "reward": 0.7540283203125, "reward_std": 0.012606613337993622, "rewards//mean": 0.7540283203125, "rewards//std": 0.031141061335802078, "step": 1623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3248, "grad_norm": 2.5597338676452637, "kl": 1.7195310927927494, "learning_rate": 7.7085410514137e-07, "loss": 0.172, "num_tokens": 14037240.0, "reward": 0.74755859375, "reward_std": 0.013452245853841305, "rewards//mean": 0.74755859375, "rewards//std": 0.0396929532289505, "step": 1624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.325, "grad_norm": 2.0841336250305176, "kl": 1.7714228797703981, "learning_rate": 7.705873114308527e-07, "loss": 0.1771, "num_tokens": 14045912.0, "reward": 0.75604248046875, "reward_std": 0.012261072173714638, "rewards//mean": 0.75604248046875, "rewards//std": 0.03111741691827774, "step": 1625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3252, "grad_norm": 3.2604620456695557, "kl": 1.9581709802150726, "learning_rate": 7.703204087277988e-07, "loss": 0.1958, "num_tokens": 14054608.0, "reward": 0.75244140625, "reward_std": 0.011977639980614185, "rewards//mean": 0.75244140625, "rewards//std": 0.026009095832705498, "step": 1626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3254, "grad_norm": 4.069620609283447, "kl": 1.4125154428184032, "learning_rate": 7.700533971397165e-07, "loss": 0.1413, "num_tokens": 14063248.0, "reward": 0.7188720703125, "reward_std": 0.009850779548287392, "rewards//mean": 0.7188720703125, "rewards//std": 0.03520766645669937, "step": 1627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3256, "grad_norm": 13.517831802368164, "kl": 1.311868930235505, "learning_rate": 7.697862767741583e-07, "loss": 0.1312, "num_tokens": 14071864.0, "reward": 0.7684326171875, "reward_std": 0.01460731215775013, "rewards//mean": 0.7684326171875, "rewards//std": 0.034262850880622864, "step": 1628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3258, "grad_norm": 2.5224111080169678, "kl": 1.492361443117261, "learning_rate": 7.695190477387199e-07, "loss": 0.1492, "num_tokens": 14080496.0, "reward": 0.75189208984375, "reward_std": 0.01079997792840004, "rewards//mean": 0.75189208984375, "rewards//std": 0.029846590012311935, "step": 1629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.326, "grad_norm": 2.7099499702453613, "kl": 1.025027807801962, "learning_rate": 7.692517101410414e-07, "loss": 0.1025, "num_tokens": 14089224.0, "reward": 0.77435302734375, "reward_std": 0.00937599316239357, "rewards//mean": 0.77435302734375, "rewards//std": 0.026178419589996338, "step": 1630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3262, "grad_norm": 6.866759300231934, "kl": 1.465584084391594, "learning_rate": 7.689842640888063e-07, "loss": 0.1466, "num_tokens": 14097808.0, "reward": 0.75164794921875, "reward_std": 0.0056014652363955975, "rewards//mean": 0.75164794921875, "rewards//std": 0.02820965275168419, "step": 1631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3264, "grad_norm": 2.365687847137451, "kl": 1.803080828860402, "learning_rate": 7.687167096897418e-07, "loss": 0.1803, "num_tokens": 14106552.0, "reward": 0.7303466796875, "reward_std": 0.014007436111569405, "rewards//mean": 0.7303466796875, "rewards//std": 0.029338955879211426, "step": 1632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3266, "grad_norm": 2.817216157913208, "kl": 1.9862238634377718, "learning_rate": 7.684490470516185e-07, "loss": 0.1986, "num_tokens": 14115120.0, "reward": 0.76763916015625, "reward_std": 0.01972937025129795, "rewards//mean": 0.76763916015625, "rewards//std": 0.033414848148822784, "step": 1633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3268, "grad_norm": 5.252283096313477, "kl": 2.063719341531396, "learning_rate": 7.681812762822515e-07, "loss": 0.2064, "num_tokens": 14123784.0, "reward": 0.7532958984375, "reward_std": 0.011641591787338257, "rewards//mean": 0.7532958984375, "rewards//std": 0.03930491954088211, "step": 1634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.327, "grad_norm": 3.38424015045166, "kl": 0.7838151380419731, "learning_rate": 7.679133974894982e-07, "loss": 0.0784, "num_tokens": 14132368.0, "reward": 0.75506591796875, "reward_std": 0.0034234439954161644, "rewards//mean": 0.75506591796875, "rewards//std": 0.031208738684654236, "step": 1635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3272, "grad_norm": 4.37358283996582, "kl": 1.3049269057810307, "learning_rate": 7.676454107812607e-07, "loss": 0.1305, "num_tokens": 14140984.0, "reward": 0.78289794921875, "reward_std": 0.010955949313938618, "rewards//mean": 0.78289794921875, "rewards//std": 0.032591529190540314, "step": 1636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3274, "grad_norm": 10.67484188079834, "kl": 2.8740222696214914, "learning_rate": 7.673773162654836e-07, "loss": 0.2874, "num_tokens": 14149696.0, "reward": 0.7432861328125, "reward_std": 0.01724255643785, "rewards//mean": 0.7432861328125, "rewards//std": 0.039114974439144135, "step": 1637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3276, "grad_norm": 3.0229294300079346, "kl": 1.480850925669074, "learning_rate": 7.671091140501555e-07, "loss": 0.1481, "num_tokens": 14158304.0, "reward": 0.7454833984375, "reward_std": 0.008684443309903145, "rewards//mean": 0.7454833984375, "rewards//std": 0.023253023624420166, "step": 1638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3278, "grad_norm": 7.474429130554199, "kl": 1.0412239786237478, "learning_rate": 7.668408042433081e-07, "loss": 0.1041, "num_tokens": 14166912.0, "reward": 0.75970458984375, "reward_std": 0.013540440239012241, "rewards//mean": 0.75970458984375, "rewards//std": 0.03504233807325363, "step": 1639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.328, "grad_norm": 4.839244365692139, "kl": 1.6017267871648073, "learning_rate": 7.665723869530169e-07, "loss": 0.1602, "num_tokens": 14175480.0, "reward": 0.77215576171875, "reward_std": 0.01985972747206688, "rewards//mean": 0.77215576171875, "rewards//std": 0.03297019377350807, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3282, "grad_norm": 2.8047549724578857, "kl": 0.9350009337067604, "learning_rate": 7.663038622873999e-07, "loss": 0.0935, "num_tokens": 14184088.0, "reward": 0.74517822265625, "reward_std": 0.007546941749751568, "rewards//mean": 0.74517822265625, "rewards//std": 0.03274675831198692, "step": 1641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3284, "grad_norm": 3.22479510307312, "kl": 1.7214165013283491, "learning_rate": 7.660352303546192e-07, "loss": 0.1721, "num_tokens": 14192728.0, "reward": 0.71905517578125, "reward_std": 0.016738075762987137, "rewards//mean": 0.71905517578125, "rewards//std": 0.04340509697794914, "step": 1642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3286, "grad_norm": 9.772603034973145, "kl": 1.9997139070183039, "learning_rate": 7.657664912628794e-07, "loss": 0.2, "num_tokens": 14201512.0, "reward": 0.7371826171875, "reward_std": 0.007628243882209063, "rewards//mean": 0.7371826171875, "rewards//std": 0.03613448888063431, "step": 1643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3288, "grad_norm": 2.8274033069610596, "kl": 1.032218774780631, "learning_rate": 7.654976451204287e-07, "loss": 0.1032, "num_tokens": 14210072.0, "reward": 0.733642578125, "reward_std": 0.0062283845618367195, "rewards//mean": 0.733642578125, "rewards//std": 0.032732944935560226, "step": 1644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.329, "grad_norm": 4.682992935180664, "kl": 1.2679704055190086, "learning_rate": 7.652286920355583e-07, "loss": 0.1268, "num_tokens": 14218784.0, "reward": 0.7537841796875, "reward_std": 0.012609190307557583, "rewards//mean": 0.7537841796875, "rewards//std": 0.034694869071245193, "step": 1645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3292, "grad_norm": 6.466736793518066, "kl": 1.5422461535781622, "learning_rate": 7.649596321166024e-07, "loss": 0.1542, "num_tokens": 14227544.0, "reward": 0.76214599609375, "reward_std": 0.007505156099796295, "rewards//mean": 0.76214599609375, "rewards//std": 0.030290622264146805, "step": 1646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3294, "grad_norm": 6.936708927154541, "kl": 2.0178975593298674, "learning_rate": 7.646904654719385e-07, "loss": 0.2018, "num_tokens": 14236200.0, "reward": 0.75567626953125, "reward_std": 0.013023676350712776, "rewards//mean": 0.75567626953125, "rewards//std": 0.03307928517460823, "step": 1647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3296, "grad_norm": 3.4528238773345947, "kl": 1.3103303592652082, "learning_rate": 7.644211922099867e-07, "loss": 0.131, "num_tokens": 14244816.0, "reward": 0.74505615234375, "reward_std": 0.009770406410098076, "rewards//mean": 0.74505615234375, "rewards//std": 0.024678243324160576, "step": 1648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3298, "grad_norm": 3.8525946140289307, "kl": 0.5916859656572342, "learning_rate": 7.641518124392103e-07, "loss": 0.0592, "num_tokens": 14253456.0, "reward": 0.7496337890625, "reward_std": 0.0044884709641337395, "rewards//mean": 0.7496337890625, "rewards//std": 0.022474413737654686, "step": 1649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.33, "grad_norm": 3.8584213256835938, "kl": 1.8569832909852266, "learning_rate": 7.638823262681154e-07, "loss": 0.1857, "num_tokens": 14262040.0, "reward": 0.77972412109375, "reward_std": 0.01577387936413288, "rewards//mean": 0.77972412109375, "rewards//std": 0.038378290832042694, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3302, "grad_norm": 4.530097961425781, "kl": 0.9330780766904354, "learning_rate": 7.636127338052511e-07, "loss": 0.0933, "num_tokens": 14270656.0, "reward": 0.75543212890625, "reward_std": 0.007381060626357794, "rewards//mean": 0.75543212890625, "rewards//std": 0.030150368809700012, "step": 1651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3304, "grad_norm": 4.290153503417969, "kl": 1.5832807626575232, "learning_rate": 7.633430351592093e-07, "loss": 0.1583, "num_tokens": 14279280.0, "reward": 0.75518798828125, "reward_std": 0.013890420086681843, "rewards//mean": 0.75518798828125, "rewards//std": 0.03930390998721123, "step": 1652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3306, "grad_norm": 4.185068607330322, "kl": 1.0172320175915956, "learning_rate": 7.630732304386243e-07, "loss": 0.1017, "num_tokens": 14287896.0, "reward": 0.77313232421875, "reward_std": 0.012001155875623226, "rewards//mean": 0.77313232421875, "rewards//std": 0.029530974105000496, "step": 1653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3308, "grad_norm": 2.844931125640869, "kl": 2.5025689974427223, "learning_rate": 7.628033197521735e-07, "loss": 0.2503, "num_tokens": 14296560.0, "reward": 0.7325439453125, "reward_std": 0.010810887441039085, "rewards//mean": 0.7325439453125, "rewards//std": 0.025477223098278046, "step": 1654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.331, "grad_norm": 3.049485921859741, "kl": 1.1164589691907167, "learning_rate": 7.625333032085769e-07, "loss": 0.1116, "num_tokens": 14305192.0, "reward": 0.73834228515625, "reward_std": 0.005308812949806452, "rewards//mean": 0.73834228515625, "rewards//std": 0.02388272061944008, "step": 1655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3312, "grad_norm": 3.390105962753296, "kl": 2.4483199659734964, "learning_rate": 7.622631809165972e-07, "loss": 0.2448, "num_tokens": 14313848.0, "reward": 0.75872802734375, "reward_std": 0.0175870880484581, "rewards//mean": 0.75872802734375, "rewards//std": 0.035954900085926056, "step": 1656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3314, "grad_norm": 3.059648275375366, "kl": 0.9026069082319736, "learning_rate": 7.619929529850396e-07, "loss": 0.0903, "num_tokens": 14322432.0, "reward": 0.769287109375, "reward_std": 0.010657286271452904, "rewards//mean": 0.769287109375, "rewards//std": 0.030843360349535942, "step": 1657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3316, "grad_norm": 1.5156716108322144, "kl": 1.1438779421150684, "learning_rate": 7.617226195227517e-07, "loss": 0.1144, "num_tokens": 14331064.0, "reward": 0.77337646484375, "reward_std": 0.008215641602873802, "rewards//mean": 0.77337646484375, "rewards//std": 0.02871754765510559, "step": 1658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3318, "grad_norm": 4.749110698699951, "kl": 1.5000360701233149, "learning_rate": 7.614521806386243e-07, "loss": 0.15, "num_tokens": 14339672.0, "reward": 0.75054931640625, "reward_std": 0.010491067543625832, "rewards//mean": 0.75054931640625, "rewards//std": 0.0165416169911623, "step": 1659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.332, "grad_norm": 4.983676433563232, "kl": 1.2771404217928648, "learning_rate": 7.611816364415895e-07, "loss": 0.1277, "num_tokens": 14348360.0, "reward": 0.74749755859375, "reward_std": 0.011189857497811317, "rewards//mean": 0.74749755859375, "rewards//std": 0.04224032908678055, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3322, "grad_norm": 3.840453863143921, "kl": 0.8378145918250084, "learning_rate": 7.60910987040623e-07, "loss": 0.0838, "num_tokens": 14356952.0, "reward": 0.7608642578125, "reward_std": 0.006858887150883675, "rewards//mean": 0.7608642578125, "rewards//std": 0.025079594925045967, "step": 1661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3324, "grad_norm": 11.438995361328125, "kl": 2.391136320307851, "learning_rate": 7.606402325447419e-07, "loss": 0.2391, "num_tokens": 14365712.0, "reward": 0.7235107421875, "reward_std": 0.012360994704067707, "rewards//mean": 0.7235107421875, "rewards//std": 0.036567576229572296, "step": 1662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3326, "grad_norm": 4.214688301086426, "kl": 1.569329358637333, "learning_rate": 7.603693730630066e-07, "loss": 0.1569, "num_tokens": 14374352.0, "reward": 0.7423095703125, "reward_std": 0.008905535563826561, "rewards//mean": 0.7423095703125, "rewards//std": 0.03470184653997421, "step": 1663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3328, "grad_norm": 3.899890899658203, "kl": 1.8594390582293272, "learning_rate": 7.600984087045186e-07, "loss": 0.1859, "num_tokens": 14383008.0, "reward": 0.74951171875, "reward_std": 0.009215177968144417, "rewards//mean": 0.74951171875, "rewards//std": 0.03464968129992485, "step": 1664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.333, "grad_norm": 1.095352053642273, "kl": 0.7407671269029379, "learning_rate": 7.598273395784229e-07, "loss": 0.0741, "num_tokens": 14391616.0, "reward": 0.79248046875, "reward_std": 0.0021278527565300465, "rewards//mean": 0.79248046875, "rewards//std": 0.02507023885846138, "step": 1665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3332, "grad_norm": 4.591573715209961, "kl": 1.305159417912364, "learning_rate": 7.59556165793906e-07, "loss": 0.1305, "num_tokens": 14400200.0, "reward": 0.72979736328125, "reward_std": 0.0060786036774516106, "rewards//mean": 0.72979736328125, "rewards//std": 0.03513595461845398, "step": 1666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3334, "grad_norm": 3.2072863578796387, "kl": 0.8619169108569622, "learning_rate": 7.592848874601963e-07, "loss": 0.0862, "num_tokens": 14408840.0, "reward": 0.76300048828125, "reward_std": 0.00585097074508667, "rewards//mean": 0.76300048828125, "rewards//std": 0.030481424182653427, "step": 1667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3336, "grad_norm": 3.877617835998535, "kl": 1.613358285278082, "learning_rate": 7.590135046865651e-07, "loss": 0.1613, "num_tokens": 14417464.0, "reward": 0.77740478515625, "reward_std": 0.01245960034430027, "rewards//mean": 0.77740478515625, "rewards//std": 0.03658759966492653, "step": 1668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3338, "grad_norm": 2.6459970474243164, "kl": 1.7920551113784313, "learning_rate": 7.587420175823252e-07, "loss": 0.1792, "num_tokens": 14426064.0, "reward": 0.743408203125, "reward_std": 0.011848630383610725, "rewards//mean": 0.743408203125, "rewards//std": 0.03310084342956543, "step": 1669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.334, "grad_norm": 5.083470344543457, "kl": 1.1938681211322546, "learning_rate": 7.584704262568314e-07, "loss": 0.1194, "num_tokens": 14434632.0, "reward": 0.754638671875, "reward_std": 0.008189452812075615, "rewards//mean": 0.754638671875, "rewards//std": 0.030176525935530663, "step": 1670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3342, "grad_norm": 1.880113124847412, "kl": 0.9313200451433659, "learning_rate": 7.581987308194809e-07, "loss": 0.0931, "num_tokens": 14443280.0, "reward": 0.75433349609375, "reward_std": 0.0029908069409430027, "rewards//mean": 0.75433349609375, "rewards//std": 0.030860962346196175, "step": 1671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3344, "grad_norm": 3.1007273197174072, "kl": 1.3674478232860565, "learning_rate": 7.579269313797125e-07, "loss": 0.1367, "num_tokens": 14451808.0, "reward": 0.72784423828125, "reward_std": 0.008877030573785305, "rewards//mean": 0.72784423828125, "rewards//std": 0.027081677690148354, "step": 1672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3346, "grad_norm": 8.102618217468262, "kl": 1.8559861723333597, "learning_rate": 7.576550280470071e-07, "loss": 0.1856, "num_tokens": 14460544.0, "reward": 0.76214599609375, "reward_std": 0.014675160869956017, "rewards//mean": 0.76214599609375, "rewards//std": 0.03505011275410652, "step": 1673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3348, "grad_norm": 2.681720733642578, "kl": 1.2322614286094904, "learning_rate": 7.573830209308872e-07, "loss": 0.1232, "num_tokens": 14469176.0, "reward": 0.73406982421875, "reward_std": 0.004265114665031433, "rewards//mean": 0.73406982421875, "rewards//std": 0.032371558248996735, "step": 1674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.335, "grad_norm": 8.004125595092773, "kl": 1.0956176165491343, "learning_rate": 7.57110910140917e-07, "loss": 0.1096, "num_tokens": 14477800.0, "reward": 0.7530517578125, "reward_std": 0.011142227798700333, "rewards//mean": 0.7530517578125, "rewards//std": 0.026461174711585045, "step": 1675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3352, "grad_norm": 5.298357963562012, "kl": 1.3848022278398275, "learning_rate": 7.568386957867032e-07, "loss": 0.1385, "num_tokens": 14486440.0, "reward": 0.76104736328125, "reward_std": 0.006294413469731808, "rewards//mean": 0.76104736328125, "rewards//std": 0.0307936891913414, "step": 1676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3354, "grad_norm": 5.516765117645264, "kl": 0.8469223249703646, "learning_rate": 7.565663779778933e-07, "loss": 0.0847, "num_tokens": 14495040.0, "reward": 0.73773193359375, "reward_std": 0.004481610842049122, "rewards//mean": 0.73773193359375, "rewards//std": 0.021596727892756462, "step": 1677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3356, "grad_norm": 4.592514991760254, "kl": 1.4531228300184011, "learning_rate": 7.562939568241771e-07, "loss": 0.1453, "num_tokens": 14503640.0, "reward": 0.75836181640625, "reward_std": 0.010041479021310806, "rewards//mean": 0.75836181640625, "rewards//std": 0.018634773790836334, "step": 1678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3358, "grad_norm": 8.456008911132812, "kl": 1.7594390772283077, "learning_rate": 7.560214324352858e-07, "loss": 0.1759, "num_tokens": 14512272.0, "reward": 0.73541259765625, "reward_std": 0.005506291054189205, "rewards//mean": 0.73541259765625, "rewards//std": 0.028307681903243065, "step": 1679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.336, "grad_norm": 4.635588645935059, "kl": 1.3168210051953793, "learning_rate": 7.55748804920992e-07, "loss": 0.1317, "num_tokens": 14520976.0, "reward": 0.75030517578125, "reward_std": 0.010580218397080898, "rewards//mean": 0.75030517578125, "rewards//std": 0.03075236827135086, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3362, "grad_norm": 1.6931532621383667, "kl": 1.3724579811096191, "learning_rate": 7.554760743911103e-07, "loss": 0.1372, "num_tokens": 14529696.0, "reward": 0.737548828125, "reward_std": 0.007787193171679974, "rewards//mean": 0.737548828125, "rewards//std": 0.03272554278373718, "step": 1681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3364, "grad_norm": 1.35236394405365, "kl": 0.5628429744392633, "learning_rate": 7.552032409554962e-07, "loss": 0.0563, "num_tokens": 14538296.0, "reward": 0.770751953125, "reward_std": 0.002668093889951706, "rewards//mean": 0.770751953125, "rewards//std": 0.017453676089644432, "step": 1682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3366, "grad_norm": 2.2473065853118896, "kl": 1.3161607328802347, "learning_rate": 7.549303047240474e-07, "loss": 0.1316, "num_tokens": 14546904.0, "reward": 0.78643798828125, "reward_std": 0.00971379317343235, "rewards//mean": 0.78643798828125, "rewards//std": 0.02911592461168766, "step": 1683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3368, "grad_norm": 2.310183525085449, "kl": 0.814789243042469, "learning_rate": 7.54657265806702e-07, "loss": 0.0815, "num_tokens": 14555512.0, "reward": 0.77813720703125, "reward_std": 0.007041408680379391, "rewards//mean": 0.77813720703125, "rewards//std": 0.029249753803014755, "step": 1684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.337, "grad_norm": 9.518553733825684, "kl": 1.6211869530379772, "learning_rate": 7.543841243134408e-07, "loss": 0.1621, "num_tokens": 14564144.0, "reward": 0.75732421875, "reward_std": 0.010654858313500881, "rewards//mean": 0.75732421875, "rewards//std": 0.03877311199903488, "step": 1685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3372, "grad_norm": 4.3729329109191895, "kl": 1.1276446674019098, "learning_rate": 7.541108803542845e-07, "loss": 0.1128, "num_tokens": 14572752.0, "reward": 0.760498046875, "reward_std": 0.010988160036504269, "rewards//mean": 0.760498046875, "rewards//std": 0.035861365497112274, "step": 1686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3374, "grad_norm": 4.076907157897949, "kl": 1.7543259430676699, "learning_rate": 7.538375340392961e-07, "loss": 0.1754, "num_tokens": 14581416.0, "reward": 0.71630859375, "reward_std": 0.01102924533188343, "rewards//mean": 0.71630859375, "rewards//std": 0.037875697016716, "step": 1687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3376, "grad_norm": 5.360969066619873, "kl": 1.605041479691863, "learning_rate": 7.535640854785791e-07, "loss": 0.1605, "num_tokens": 14590024.0, "reward": 0.7705078125, "reward_std": 0.01736287772655487, "rewards//mean": 0.7705078125, "rewards//std": 0.03298448026180267, "step": 1688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3378, "grad_norm": 3.594897508621216, "kl": 0.8084434028714895, "learning_rate": 7.532905347822791e-07, "loss": 0.0808, "num_tokens": 14598648.0, "reward": 0.7225341796875, "reward_std": 0.005750302225351334, "rewards//mean": 0.7225341796875, "rewards//std": 0.032271627336740494, "step": 1689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.338, "grad_norm": 9.191390991210938, "kl": 1.743047945201397, "learning_rate": 7.530168820605818e-07, "loss": 0.1743, "num_tokens": 14607320.0, "reward": 0.74420166015625, "reward_std": 0.00988863781094551, "rewards//mean": 0.74420166015625, "rewards//std": 0.03369634971022606, "step": 1690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3382, "grad_norm": 3.980365514755249, "kl": 1.3973311893641949, "learning_rate": 7.527431274237149e-07, "loss": 0.1397, "num_tokens": 14616048.0, "reward": 0.729248046875, "reward_std": 0.009896701201796532, "rewards//mean": 0.729248046875, "rewards//std": 0.02880498208105564, "step": 1691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3384, "grad_norm": 5.471235752105713, "kl": 1.5794076099991798, "learning_rate": 7.524692709819463e-07, "loss": 0.1579, "num_tokens": 14624664.0, "reward": 0.7625732421875, "reward_std": 0.017121773213148117, "rewards//mean": 0.7625732421875, "rewards//std": 0.02605532482266426, "step": 1692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3386, "grad_norm": 19.024185180664062, "kl": 1.6618862971663475, "learning_rate": 7.521953128455855e-07, "loss": 0.1662, "num_tokens": 14633264.0, "reward": 0.754150390625, "reward_std": 0.004000131972134113, "rewards//mean": 0.754150390625, "rewards//std": 0.027908140793442726, "step": 1693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3388, "grad_norm": 8.094286918640137, "kl": 2.2584478612989187, "learning_rate": 7.519212531249829e-07, "loss": 0.2258, "num_tokens": 14641864.0, "reward": 0.74407958984375, "reward_std": 0.008400380611419678, "rewards//mean": 0.74407958984375, "rewards//std": 0.025429869070649147, "step": 1694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.339, "grad_norm": 28.27859878540039, "kl": 2.725379491224885, "learning_rate": 7.516470919305298e-07, "loss": 0.2725, "num_tokens": 14650480.0, "reward": 0.72296142578125, "reward_std": 0.012959511019289494, "rewards//mean": 0.72296142578125, "rewards//std": 0.02918447181582451, "step": 1695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3392, "grad_norm": 20.440521240234375, "kl": 1.9849905855953693, "learning_rate": 7.513728293726579e-07, "loss": 0.1985, "num_tokens": 14659040.0, "reward": 0.781005859375, "reward_std": 0.012903118506073952, "rewards//mean": 0.781005859375, "rewards//std": 0.024238893762230873, "step": 1696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3394, "grad_norm": 25.964466094970703, "kl": 1.5389139335602522, "learning_rate": 7.510984655618406e-07, "loss": 0.1539, "num_tokens": 14667672.0, "reward": 0.7200927734375, "reward_std": 0.006359969265758991, "rewards//mean": 0.7200927734375, "rewards//std": 0.029258355498313904, "step": 1697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3396, "grad_norm": 9.197903633117676, "kl": 1.9400007743388414, "learning_rate": 7.508240006085913e-07, "loss": 0.194, "num_tokens": 14676360.0, "reward": 0.75677490234375, "reward_std": 0.007967963814735413, "rewards//mean": 0.75677490234375, "rewards//std": 0.025782210752367973, "step": 1698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3398, "grad_norm": 15.065352439880371, "kl": 2.113749247044325, "learning_rate": 7.505494346234647e-07, "loss": 0.2114, "num_tokens": 14684928.0, "reward": 0.7586669921875, "reward_std": 0.014345825649797916, "rewards//mean": 0.7586669921875, "rewards//std": 0.032980579882860184, "step": 1699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.34, "grad_norm": 19.388952255249023, "kl": 3.091250417754054, "learning_rate": 7.502747677170555e-07, "loss": 0.3091, "num_tokens": 14693568.0, "reward": 0.738525390625, "reward_std": 0.013732793740928173, "rewards//mean": 0.738525390625, "rewards//std": 0.03740192577242851, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3402, "grad_norm": 32.21320724487305, "kl": 3.8705898970365524, "learning_rate": 7.5e-07, "loss": 0.3871, "num_tokens": 14702208.0, "reward": 0.73223876953125, "reward_std": 0.019241712987422943, "rewards//mean": 0.73223876953125, "rewards//std": 0.06249197944998741, "step": 1701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3404, "grad_norm": 7.169406414031982, "kl": 1.371338753029704, "learning_rate": 7.497251315829743e-07, "loss": 0.1371, "num_tokens": 14710888.0, "reward": 0.76019287109375, "reward_std": 0.011022357270121574, "rewards//mean": 0.76019287109375, "rewards//std": 0.0305731613188982, "step": 1702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3406, "grad_norm": 7.21544075012207, "kl": 0.9435538854449987, "learning_rate": 7.494501625766955e-07, "loss": 0.0944, "num_tokens": 14719608.0, "reward": 0.7705078125, "reward_std": 0.0077237319201231, "rewards//mean": 0.7705078125, "rewards//std": 0.030233660712838173, "step": 1703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3408, "grad_norm": 1.1966946125030518, "kl": 0.7175771556794643, "learning_rate": 7.491750930919212e-07, "loss": 0.0718, "num_tokens": 14728232.0, "reward": 0.7764892578125, "reward_std": 0.0036823658738285303, "rewards//mean": 0.7764892578125, "rewards//std": 0.025358112528920174, "step": 1704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.341, "grad_norm": 2.9344372749328613, "kl": 0.6410322058945894, "learning_rate": 7.488999232394491e-07, "loss": 0.0641, "num_tokens": 14736920.0, "reward": 0.7530517578125, "reward_std": 0.003432015422731638, "rewards//mean": 0.7530517578125, "rewards//std": 0.025424884632229805, "step": 1705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3412, "grad_norm": 4.979156970977783, "kl": 1.1780236940830946, "learning_rate": 7.486246531301177e-07, "loss": 0.1178, "num_tokens": 14745584.0, "reward": 0.7303466796875, "reward_std": 0.011336080729961395, "rewards//mean": 0.7303466796875, "rewards//std": 0.0459880605340004, "step": 1706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3414, "grad_norm": 4.931070804595947, "kl": 1.3935081399977207, "learning_rate": 7.483492828748056e-07, "loss": 0.1394, "num_tokens": 14754312.0, "reward": 0.7525634765625, "reward_std": 0.013016480952501297, "rewards//mean": 0.7525634765625, "rewards//std": 0.026532016694545746, "step": 1707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3416, "grad_norm": 1.5224446058273315, "kl": 0.5527310520410538, "learning_rate": 7.480738125844322e-07, "loss": 0.0553, "num_tokens": 14762928.0, "reward": 0.7354736328125, "reward_std": 0.001750173862092197, "rewards//mean": 0.7354736328125, "rewards//std": 0.027370035648345947, "step": 1708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3418, "grad_norm": 1.7535755634307861, "kl": 0.5227664280682802, "learning_rate": 7.477982423699567e-07, "loss": 0.0523, "num_tokens": 14771584.0, "reward": 0.77703857421875, "reward_std": 0.004122797399759293, "rewards//mean": 0.77703857421875, "rewards//std": 0.024835387244820595, "step": 1709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.342, "grad_norm": 1.2541731595993042, "kl": 0.576104111969471, "learning_rate": 7.475225723423788e-07, "loss": 0.0576, "num_tokens": 14780264.0, "reward": 0.76513671875, "reward_std": 0.003349718637764454, "rewards//mean": 0.76513671875, "rewards//std": 0.021760214120149612, "step": 1710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3422, "grad_norm": 4.6338725090026855, "kl": 1.3434299621731043, "learning_rate": 7.472468026127384e-07, "loss": 0.1343, "num_tokens": 14789040.0, "reward": 0.774658203125, "reward_std": 0.010380629450082779, "rewards//mean": 0.774658203125, "rewards//std": 0.028055289760231972, "step": 1711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3424, "grad_norm": 4.566206455230713, "kl": 0.7673952169716358, "learning_rate": 7.469709332921154e-07, "loss": 0.0767, "num_tokens": 14797600.0, "reward": 0.74444580078125, "reward_std": 0.007673834916204214, "rewards//mean": 0.74444580078125, "rewards//std": 0.030063383281230927, "step": 1712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3426, "grad_norm": 1.81694757938385, "kl": 1.1699585411697626, "learning_rate": 7.4669496449163e-07, "loss": 0.117, "num_tokens": 14806248.0, "reward": 0.74298095703125, "reward_std": 0.005607170984148979, "rewards//mean": 0.74298095703125, "rewards//std": 0.01837710663676262, "step": 1713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3428, "grad_norm": 6.147146224975586, "kl": 1.0212355516850948, "learning_rate": 7.464188963224427e-07, "loss": 0.1021, "num_tokens": 14814856.0, "reward": 0.7528076171875, "reward_std": 0.010117903351783752, "rewards//mean": 0.7528076171875, "rewards//std": 0.030028430745005608, "step": 1714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.343, "grad_norm": 2.5232460498809814, "kl": 0.9998667482286692, "learning_rate": 7.461427288957531e-07, "loss": 0.1, "num_tokens": 14823456.0, "reward": 0.75396728515625, "reward_std": 0.00727267749607563, "rewards//mean": 0.75396728515625, "rewards//std": 0.025718722492456436, "step": 1715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3432, "grad_norm": 1.8526498079299927, "kl": 0.9887200873345137, "learning_rate": 7.45866462322802e-07, "loss": 0.0989, "num_tokens": 14832032.0, "reward": 0.77264404296875, "reward_std": 0.003451541531831026, "rewards//mean": 0.77264404296875, "rewards//std": 0.016070950776338577, "step": 1716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3434, "grad_norm": 1.2874479293823242, "kl": 1.0913045555353165, "learning_rate": 7.45590096714869e-07, "loss": 0.1091, "num_tokens": 14840616.0, "reward": 0.7208251953125, "reward_std": 0.008463181555271149, "rewards//mean": 0.7208251953125, "rewards//std": 0.030048588290810585, "step": 1717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3436, "grad_norm": 1.046310305595398, "kl": 0.6895448602735996, "learning_rate": 7.453136321832745e-07, "loss": 0.069, "num_tokens": 14849224.0, "reward": 0.7720947265625, "reward_std": 0.0013896794989705086, "rewards//mean": 0.7720947265625, "rewards//std": 0.015659356489777565, "step": 1718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3438, "grad_norm": 0.39746522903442383, "kl": 0.4428255669772625, "learning_rate": 7.450370688393784e-07, "loss": 0.0443, "num_tokens": 14857824.0, "reward": 0.75555419921875, "reward_std": 0.001083534792996943, "rewards//mean": 0.75555419921875, "rewards//std": 0.02159602753818035, "step": 1719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.344, "grad_norm": 1.386067509651184, "kl": 1.1081818174570799, "learning_rate": 7.447604067945802e-07, "loss": 0.1108, "num_tokens": 14866520.0, "reward": 0.72930908203125, "reward_std": 0.004850580357015133, "rewards//mean": 0.72930908203125, "rewards//std": 0.026790568605065346, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3442, "grad_norm": 4.141593933105469, "kl": 1.4461893811821938, "learning_rate": 7.444836461603194e-07, "loss": 0.1446, "num_tokens": 14875352.0, "reward": 0.75823974609375, "reward_std": 0.009514996781945229, "rewards//mean": 0.75823974609375, "rewards//std": 0.02587481215596199, "step": 1721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3444, "grad_norm": 5.451181888580322, "kl": 1.7047587744891644, "learning_rate": 7.442067870480751e-07, "loss": 0.1705, "num_tokens": 14883920.0, "reward": 0.73529052734375, "reward_std": 0.010152137838304043, "rewards//mean": 0.73529052734375, "rewards//std": 0.038655757904052734, "step": 1722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3446, "grad_norm": 4.6878886222839355, "kl": 1.3176060393452644, "learning_rate": 7.439298295693663e-07, "loss": 0.1318, "num_tokens": 14892592.0, "reward": 0.743896484375, "reward_std": 0.005385981872677803, "rewards//mean": 0.743896484375, "rewards//std": 0.036157313734292984, "step": 1723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3448, "grad_norm": 2.1867740154266357, "kl": 0.7468766365200281, "learning_rate": 7.436527738357513e-07, "loss": 0.0747, "num_tokens": 14901192.0, "reward": 0.76495361328125, "reward_std": 0.00658496655523777, "rewards//mean": 0.76495361328125, "rewards//std": 0.03055681847035885, "step": 1724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.345, "grad_norm": 3.0420939922332764, "kl": 0.6787095963954926, "learning_rate": 7.433756199588282e-07, "loss": 0.0679, "num_tokens": 14909840.0, "reward": 0.778076171875, "reward_std": 0.0028048825915902853, "rewards//mean": 0.778076171875, "rewards//std": 0.021675176918506622, "step": 1725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3452, "grad_norm": 3.7577598094940186, "kl": 1.7973673641681671, "learning_rate": 7.430983680502343e-07, "loss": 0.1797, "num_tokens": 14918520.0, "reward": 0.761962890625, "reward_std": 0.014277311973273754, "rewards//mean": 0.761962890625, "rewards//std": 0.03215061128139496, "step": 1726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3454, "grad_norm": 6.4812750816345215, "kl": 1.8695234637707472, "learning_rate": 7.42821018221647e-07, "loss": 0.187, "num_tokens": 14927264.0, "reward": 0.77349853515625, "reward_std": 0.011050630360841751, "rewards//mean": 0.77349853515625, "rewards//std": 0.04331118240952492, "step": 1727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3456, "grad_norm": 1.821395993232727, "kl": 1.505713665857911, "learning_rate": 7.425435705847825e-07, "loss": 0.1506, "num_tokens": 14935976.0, "reward": 0.76043701171875, "reward_std": 0.009524603374302387, "rewards//mean": 0.76043701171875, "rewards//std": 0.02476458251476288, "step": 1728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3458, "grad_norm": 5.484038829803467, "kl": 1.577188765630126, "learning_rate": 7.422660252513968e-07, "loss": 0.1577, "num_tokens": 14944624.0, "reward": 0.74481201171875, "reward_std": 0.007182367146015167, "rewards//mean": 0.74481201171875, "rewards//std": 0.02699209563434124, "step": 1729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.346, "grad_norm": 2.585268020629883, "kl": 1.7433014456182718, "learning_rate": 7.41988382333285e-07, "loss": 0.1743, "num_tokens": 14953312.0, "reward": 0.74993896484375, "reward_std": 0.011576471850275993, "rewards//mean": 0.74993896484375, "rewards//std": 0.029891187325119972, "step": 1730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3462, "grad_norm": 3.90644907951355, "kl": 1.3983549159020185, "learning_rate": 7.417106419422818e-07, "loss": 0.1398, "num_tokens": 14961880.0, "reward": 0.79949951171875, "reward_std": 0.010875910520553589, "rewards//mean": 0.79949951171875, "rewards//std": 0.023959290236234665, "step": 1731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3464, "grad_norm": 2.3305535316467285, "kl": 1.827426040545106, "learning_rate": 7.41432804190261e-07, "loss": 0.1827, "num_tokens": 14970464.0, "reward": 0.7171630859375, "reward_std": 0.008515611290931702, "rewards//mean": 0.7171630859375, "rewards//std": 0.032156966626644135, "step": 1732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3466, "grad_norm": 2.596547842025757, "kl": 1.9944287464022636, "learning_rate": 7.411548691891357e-07, "loss": 0.1994, "num_tokens": 14979112.0, "reward": 0.76739501953125, "reward_std": 0.011900435201823711, "rewards//mean": 0.76739501953125, "rewards//std": 0.036057062447071075, "step": 1733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3468, "grad_norm": 2.95515775680542, "kl": 2.192312242463231, "learning_rate": 7.408768370508576e-07, "loss": 0.2192, "num_tokens": 14987824.0, "reward": 0.75433349609375, "reward_std": 0.011328080669045448, "rewards//mean": 0.75433349609375, "rewards//std": 0.03066413104534149, "step": 1734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.347, "grad_norm": 0.8474298119544983, "kl": 0.791348984465003, "learning_rate": 7.405987078874185e-07, "loss": 0.0791, "num_tokens": 14996432.0, "reward": 0.75408935546875, "reward_std": 0.003990694880485535, "rewards//mean": 0.75408935546875, "rewards//std": 0.03140794113278389, "step": 1735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3472, "grad_norm": 3.539428472518921, "kl": 1.5238278284668922, "learning_rate": 7.403204818108487e-07, "loss": 0.1524, "num_tokens": 15005096.0, "reward": 0.7723388671875, "reward_std": 0.012794092297554016, "rewards//mean": 0.7723388671875, "rewards//std": 0.03929413482546806, "step": 1736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3474, "grad_norm": 1.4129527807235718, "kl": 1.3678991589695215, "learning_rate": 7.400421589332174e-07, "loss": 0.1368, "num_tokens": 15013688.0, "reward": 0.76708984375, "reward_std": 0.009629899635910988, "rewards//mean": 0.76708984375, "rewards//std": 0.031369902193546295, "step": 1737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3476, "grad_norm": 1.8591399192810059, "kl": 1.5847214739769697, "learning_rate": 7.397637393666333e-07, "loss": 0.1585, "num_tokens": 15022464.0, "reward": 0.74346923828125, "reward_std": 0.007429173681885004, "rewards//mean": 0.74346923828125, "rewards//std": 0.025986313819885254, "step": 1738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3478, "grad_norm": 2.165801763534546, "kl": 1.4061892107129097, "learning_rate": 7.394852232232436e-07, "loss": 0.1406, "num_tokens": 15031144.0, "reward": 0.78155517578125, "reward_std": 0.011306533589959145, "rewards//mean": 0.78155517578125, "rewards//std": 0.02262035384774208, "step": 1739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.348, "grad_norm": 1.9580681324005127, "kl": 1.902137791737914, "learning_rate": 7.392066106152345e-07, "loss": 0.1902, "num_tokens": 15039800.0, "reward": 0.76641845703125, "reward_std": 0.009985348209738731, "rewards//mean": 0.76641845703125, "rewards//std": 0.028384050354361534, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3482, "grad_norm": 0.9027834534645081, "kl": 0.7155110444873571, "learning_rate": 7.389279016548316e-07, "loss": 0.0716, "num_tokens": 15048400.0, "reward": 0.75201416015625, "reward_std": 0.003574079368263483, "rewards//mean": 0.75201416015625, "rewards//std": 0.029528411105275154, "step": 1741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3484, "grad_norm": 2.6010026931762695, "kl": 2.065766640007496, "learning_rate": 7.386490964542982e-07, "loss": 0.2066, "num_tokens": 15057024.0, "reward": 0.7806396484375, "reward_std": 0.012823672965168953, "rewards//mean": 0.7806396484375, "rewards//std": 0.03279462456703186, "step": 1742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3486, "grad_norm": 4.488386154174805, "kl": 1.851673873141408, "learning_rate": 7.383701951259375e-07, "loss": 0.1852, "num_tokens": 15065576.0, "reward": 0.786376953125, "reward_std": 0.007897108793258667, "rewards//mean": 0.786376953125, "rewards//std": 0.026281218975782394, "step": 1743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3488, "grad_norm": 7.884819984436035, "kl": 3.009798128157854, "learning_rate": 7.380911977820906e-07, "loss": 0.301, "num_tokens": 15074264.0, "reward": 0.73211669921875, "reward_std": 0.012890150770545006, "rewards//mean": 0.73211669921875, "rewards//std": 0.027399539947509766, "step": 1744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.349, "grad_norm": 1.4514238834381104, "kl": 1.010293835774064, "learning_rate": 7.378121045351377e-07, "loss": 0.101, "num_tokens": 15082896.0, "reward": 0.74908447265625, "reward_std": 0.002725997706875205, "rewards//mean": 0.74908447265625, "rewards//std": 0.019284680485725403, "step": 1745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3492, "grad_norm": 5.77423620223999, "kl": 2.6011885572224855, "learning_rate": 7.375329154974975e-07, "loss": 0.2601, "num_tokens": 15091528.0, "reward": 0.7545166015625, "reward_std": 0.014566123485565186, "rewards//mean": 0.7545166015625, "rewards//std": 0.02837584912776947, "step": 1746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3494, "grad_norm": 3.199122667312622, "kl": 2.118050239980221, "learning_rate": 7.372536307816272e-07, "loss": 0.2118, "num_tokens": 15100120.0, "reward": 0.7659912109375, "reward_std": 0.009744489565491676, "rewards//mean": 0.7659912109375, "rewards//std": 0.02130145952105522, "step": 1747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3496, "grad_norm": 2.7947630882263184, "kl": 1.1547651756554842, "learning_rate": 7.369742505000231e-07, "loss": 0.1155, "num_tokens": 15108744.0, "reward": 0.76104736328125, "reward_std": 0.007997667416930199, "rewards//mean": 0.76104736328125, "rewards//std": 0.03725060448050499, "step": 1748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3498, "grad_norm": 1.3045270442962646, "kl": 1.191990939900279, "learning_rate": 7.366947747652191e-07, "loss": 0.1192, "num_tokens": 15117312.0, "reward": 0.73974609375, "reward_std": 0.008499959483742714, "rewards//mean": 0.73974609375, "rewards//std": 0.027040911838412285, "step": 1749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.35, "grad_norm": 7.094205379486084, "kl": 0.9772929809987545, "learning_rate": 7.364152036897882e-07, "loss": 0.0977, "num_tokens": 15125920.0, "reward": 0.747314453125, "reward_std": 0.006368104834109545, "rewards//mean": 0.747314453125, "rewards//std": 0.03124128095805645, "step": 1750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3502, "grad_norm": 1.8389047384262085, "kl": 1.2565936334431171, "learning_rate": 7.361355373863413e-07, "loss": 0.1257, "num_tokens": 15134488.0, "reward": 0.740234375, "reward_std": 0.006755336653441191, "rewards//mean": 0.740234375, "rewards//std": 0.03456920012831688, "step": 1751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3504, "grad_norm": 2.2567107677459717, "kl": 1.6556364316493273, "learning_rate": 7.358557759675284e-07, "loss": 0.1656, "num_tokens": 15143216.0, "reward": 0.77337646484375, "reward_std": 0.01561724953353405, "rewards//mean": 0.77337646484375, "rewards//std": 0.03251480311155319, "step": 1752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3506, "grad_norm": 3.0326714515686035, "kl": 2.143301162868738, "learning_rate": 7.35575919546037e-07, "loss": 0.2143, "num_tokens": 15152040.0, "reward": 0.73162841796875, "reward_std": 0.011417457833886147, "rewards//mean": 0.73162841796875, "rewards//std": 0.041722897440195084, "step": 1753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3508, "grad_norm": 6.754617214202881, "kl": 0.7951330132782459, "learning_rate": 7.352959682345935e-07, "loss": 0.0795, "num_tokens": 15160672.0, "reward": 0.74951171875, "reward_std": 0.004714501090347767, "rewards//mean": 0.74951171875, "rewards//std": 0.023556042462587357, "step": 1754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.351, "grad_norm": 1.2019836902618408, "kl": 0.9368169568479061, "learning_rate": 7.350159221459621e-07, "loss": 0.0937, "num_tokens": 15169312.0, "reward": 0.78302001953125, "reward_std": 0.005903866142034531, "rewards//mean": 0.78302001953125, "rewards//std": 0.02406897395849228, "step": 1755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3512, "grad_norm": 1.4883495569229126, "kl": 1.3949045836925507, "learning_rate": 7.347357813929454e-07, "loss": 0.1395, "num_tokens": 15177848.0, "reward": 0.75537109375, "reward_std": 0.004409474320709705, "rewards//mean": 0.75537109375, "rewards//std": 0.021298972889780998, "step": 1756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3514, "grad_norm": 7.7861247062683105, "kl": 1.6032751519232988, "learning_rate": 7.344555460883839e-07, "loss": 0.1603, "num_tokens": 15186416.0, "reward": 0.75677490234375, "reward_std": 0.00679248571395874, "rewards//mean": 0.75677490234375, "rewards//std": 0.023075519129633904, "step": 1757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3516, "grad_norm": 14.671707153320312, "kl": 1.3416200447827578, "learning_rate": 7.341752163451567e-07, "loss": 0.1342, "num_tokens": 15195064.0, "reward": 0.72930908203125, "reward_std": 0.004718040581792593, "rewards//mean": 0.72930908203125, "rewards//std": 0.028979387134313583, "step": 1758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3518, "grad_norm": 3.320373058319092, "kl": 1.6173847205936909, "learning_rate": 7.338947922761802e-07, "loss": 0.1617, "num_tokens": 15203696.0, "reward": 0.75665283203125, "reward_std": 0.011801866814494133, "rewards//mean": 0.75665283203125, "rewards//std": 0.026527095586061478, "step": 1759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.352, "grad_norm": 2.639523983001709, "kl": 1.4315192308276892, "learning_rate": 7.336142739944093e-07, "loss": 0.1432, "num_tokens": 15212416.0, "reward": 0.76043701171875, "reward_std": 0.011404254473745823, "rewards//mean": 0.76043701171875, "rewards//std": 0.031180594116449356, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3522, "grad_norm": 0.6230894327163696, "kl": 0.6868654675781727, "learning_rate": 7.333336616128369e-07, "loss": 0.0687, "num_tokens": 15221032.0, "reward": 0.75579833984375, "reward_std": 0.0021794678177684546, "rewards//mean": 0.75579833984375, "rewards//std": 0.02194301225244999, "step": 1761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3524, "grad_norm": 3.316927194595337, "kl": 1.6013391427695751, "learning_rate": 7.330529552444932e-07, "loss": 0.1601, "num_tokens": 15229704.0, "reward": 0.73968505859375, "reward_std": 0.006840704008936882, "rewards//mean": 0.73968505859375, "rewards//std": 0.030124248936772346, "step": 1762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3526, "grad_norm": 5.732491970062256, "kl": 1.6784944124519825, "learning_rate": 7.327721550024475e-07, "loss": 0.1678, "num_tokens": 15238400.0, "reward": 0.72845458984375, "reward_std": 0.010432370938360691, "rewards//mean": 0.72845458984375, "rewards//std": 0.02905711531639099, "step": 1763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3528, "grad_norm": 3.6753923892974854, "kl": 0.8355032317340374, "learning_rate": 7.324912609998053e-07, "loss": 0.0836, "num_tokens": 15247072.0, "reward": 0.73590087890625, "reward_std": 0.005861599929630756, "rewards//mean": 0.73590087890625, "rewards//std": 0.02998119592666626, "step": 1764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.353, "grad_norm": 2.246016263961792, "kl": 2.139521811157465, "learning_rate": 7.322102733497109e-07, "loss": 0.214, "num_tokens": 15255752.0, "reward": 0.76531982421875, "reward_std": 0.013181580230593681, "rewards//mean": 0.76531982421875, "rewards//std": 0.035613518208265305, "step": 1765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3532, "grad_norm": 3.0697615146636963, "kl": 1.3367989528924227, "learning_rate": 7.319291921653463e-07, "loss": 0.1337, "num_tokens": 15264384.0, "reward": 0.74383544921875, "reward_std": 0.003256019204854965, "rewards//mean": 0.74383544921875, "rewards//std": 0.026582960039377213, "step": 1766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3534, "grad_norm": 1.91382896900177, "kl": 1.5459059569984674, "learning_rate": 7.316480175599308e-07, "loss": 0.1546, "num_tokens": 15273048.0, "reward": 0.77105712890625, "reward_std": 0.009686676785349846, "rewards//mean": 0.77105712890625, "rewards//std": 0.028276648372411728, "step": 1767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3536, "grad_norm": 1.2383081912994385, "kl": 0.6192030981183052, "learning_rate": 7.313667496467215e-07, "loss": 0.0619, "num_tokens": 15281632.0, "reward": 0.75714111328125, "reward_std": 0.004506496712565422, "rewards//mean": 0.75714111328125, "rewards//std": 0.021413005888462067, "step": 1768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3538, "grad_norm": 1.7629684209823608, "kl": 1.344154804944992, "learning_rate": 7.310853885390132e-07, "loss": 0.1344, "num_tokens": 15290240.0, "reward": 0.77435302734375, "reward_std": 0.005163392052054405, "rewards//mean": 0.77435302734375, "rewards//std": 0.021527927368879318, "step": 1769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.354, "grad_norm": 6.2378973960876465, "kl": 2.1335525270551443, "learning_rate": 7.308039343501379e-07, "loss": 0.2134, "num_tokens": 15298944.0, "reward": 0.73681640625, "reward_std": 0.011033887043595314, "rewards//mean": 0.73681640625, "rewards//std": 0.03596673905849457, "step": 1770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3542, "grad_norm": 2.182572841644287, "kl": 1.2096397168934345, "learning_rate": 7.305223871934656e-07, "loss": 0.121, "num_tokens": 15307616.0, "reward": 0.77587890625, "reward_std": 0.007967408746480942, "rewards//mean": 0.77587890625, "rewards//std": 0.022846169769763947, "step": 1771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3544, "grad_norm": 4.277583122253418, "kl": 1.1626602467149496, "learning_rate": 7.302407471824033e-07, "loss": 0.1163, "num_tokens": 15316304.0, "reward": 0.76617431640625, "reward_std": 0.0031837287824600935, "rewards//mean": 0.76617431640625, "rewards//std": 0.0290310550481081, "step": 1772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3546, "grad_norm": 2.832304000854492, "kl": 1.7627367228269577, "learning_rate": 7.299590144303954e-07, "loss": 0.1763, "num_tokens": 15324912.0, "reward": 0.7637939453125, "reward_std": 0.014823010191321373, "rewards//mean": 0.7637939453125, "rewards//std": 0.032234080135822296, "step": 1773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3548, "grad_norm": 8.819801330566406, "kl": 1.3549708854407072, "learning_rate": 7.296771890509242e-07, "loss": 0.1355, "num_tokens": 15333488.0, "reward": 0.772216796875, "reward_std": 0.010270223021507263, "rewards//mean": 0.772216796875, "rewards//std": 0.03110920637845993, "step": 1774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.355, "grad_norm": 2.678743362426758, "kl": 1.418468652293086, "learning_rate": 7.293952711575086e-07, "loss": 0.1418, "num_tokens": 15342088.0, "reward": 0.74639892578125, "reward_std": 0.008328201249241829, "rewards//mean": 0.74639892578125, "rewards//std": 0.038351066410541534, "step": 1775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3552, "grad_norm": 1.8848110437393188, "kl": 1.794282415881753, "learning_rate": 7.291132608637052e-07, "loss": 0.1794, "num_tokens": 15350712.0, "reward": 0.7708740234375, "reward_std": 0.01875242590904236, "rewards//mean": 0.7708740234375, "rewards//std": 0.03361161798238754, "step": 1776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3554, "grad_norm": 3.2355854511260986, "kl": 1.2206679452210665, "learning_rate": 7.288311582831077e-07, "loss": 0.1221, "num_tokens": 15359416.0, "reward": 0.75213623046875, "reward_std": 0.007684601470828056, "rewards//mean": 0.75213623046875, "rewards//std": 0.02200845256447792, "step": 1777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3556, "grad_norm": 2.0275652408599854, "kl": 1.738957367837429, "learning_rate": 7.285489635293471e-07, "loss": 0.1739, "num_tokens": 15368040.0, "reward": 0.7371826171875, "reward_std": 0.0094405896961689, "rewards//mean": 0.7371826171875, "rewards//std": 0.022514790296554565, "step": 1778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3558, "grad_norm": 2.643275737762451, "kl": 2.499020282179117, "learning_rate": 7.282666767160912e-07, "loss": 0.2499, "num_tokens": 15376712.0, "reward": 0.73748779296875, "reward_std": 0.01339271105825901, "rewards//mean": 0.73748779296875, "rewards//std": 0.03584315627813339, "step": 1779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.356, "grad_norm": 2.4836769104003906, "kl": 1.1561640910804272, "learning_rate": 7.279842979570453e-07, "loss": 0.1156, "num_tokens": 15385416.0, "reward": 0.78717041015625, "reward_std": 0.007983257994055748, "rewards//mean": 0.78717041015625, "rewards//std": 0.0198082085698843, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3562, "grad_norm": 1.1851799488067627, "kl": 1.1809126678854227, "learning_rate": 7.277018273659516e-07, "loss": 0.1181, "num_tokens": 15394024.0, "reward": 0.78594970703125, "reward_std": 0.006664501037448645, "rewards//mean": 0.78594970703125, "rewards//std": 0.03354641795158386, "step": 1781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3564, "grad_norm": 3.261087656021118, "kl": 0.7810133509337902, "learning_rate": 7.274192650565889e-07, "loss": 0.0781, "num_tokens": 15402640.0, "reward": 0.7667236328125, "reward_std": 0.0022175246849656105, "rewards//mean": 0.7667236328125, "rewards//std": 0.0193314291536808, "step": 1782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3566, "grad_norm": 4.513838291168213, "kl": 0.8302067779004574, "learning_rate": 7.271366111427734e-07, "loss": 0.083, "num_tokens": 15411344.0, "reward": 0.7559814453125, "reward_std": 0.006560072302818298, "rewards//mean": 0.7559814453125, "rewards//std": 0.02708090841770172, "step": 1783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3568, "grad_norm": 3.3704817295074463, "kl": 1.1860128305852413, "learning_rate": 7.26853865738358e-07, "loss": 0.1186, "num_tokens": 15419992.0, "reward": 0.7576904296875, "reward_std": 0.0033324281685054302, "rewards//mean": 0.7576904296875, "rewards//std": 0.024488355964422226, "step": 1784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.357, "grad_norm": 1.4541735649108887, "kl": 1.462040601298213, "learning_rate": 7.265710289572328e-07, "loss": 0.1462, "num_tokens": 15428584.0, "reward": 0.7762451171875, "reward_std": 0.010556678287684917, "rewards//mean": 0.7762451171875, "rewards//std": 0.03362782672047615, "step": 1785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3572, "grad_norm": 3.4049315452575684, "kl": 1.2772125378251076, "learning_rate": 7.262881009133241e-07, "loss": 0.1277, "num_tokens": 15437216.0, "reward": 0.75726318359375, "reward_std": 0.008723619394004345, "rewards//mean": 0.75726318359375, "rewards//std": 0.027558742091059685, "step": 1786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3574, "grad_norm": 1.7542139291763306, "kl": 1.4295821785926819, "learning_rate": 7.260050817205955e-07, "loss": 0.143, "num_tokens": 15445824.0, "reward": 0.75946044921875, "reward_std": 0.009563788771629333, "rewards//mean": 0.75946044921875, "rewards//std": 0.03559650853276253, "step": 1787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3576, "grad_norm": 2.8776495456695557, "kl": 1.384749609977007, "learning_rate": 7.25721971493047e-07, "loss": 0.1385, "num_tokens": 15454392.0, "reward": 0.75311279296875, "reward_std": 0.007299318909645081, "rewards//mean": 0.75311279296875, "rewards//std": 0.0289082583039999, "step": 1788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3578, "grad_norm": 6.074490547180176, "kl": 1.5374908838421106, "learning_rate": 7.254387703447153e-07, "loss": 0.1537, "num_tokens": 15462984.0, "reward": 0.754150390625, "reward_std": 0.009968824684619904, "rewards//mean": 0.754150390625, "rewards//std": 0.03229343146085739, "step": 1789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.358, "grad_norm": 6.639472484588623, "kl": 2.2980942018330097, "learning_rate": 7.25155478389674e-07, "loss": 0.2298, "num_tokens": 15471616.0, "reward": 0.751953125, "reward_std": 0.010655362159013748, "rewards//mean": 0.751953125, "rewards//std": 0.03024967759847641, "step": 1790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3582, "grad_norm": 1.7258334159851074, "kl": 0.9021014496684074, "learning_rate": 7.248720957420329e-07, "loss": 0.0902, "num_tokens": 15480296.0, "reward": 0.766845703125, "reward_std": 0.004705042112618685, "rewards//mean": 0.766845703125, "rewards//std": 0.023008590564131737, "step": 1791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3584, "grad_norm": 2.6910016536712646, "kl": 1.2031854316592216, "learning_rate": 7.245886225159386e-07, "loss": 0.1203, "num_tokens": 15488912.0, "reward": 0.70916748046875, "reward_std": 0.005398493260145187, "rewards//mean": 0.70916748046875, "rewards//std": 0.04426497966051102, "step": 1792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3586, "grad_norm": 5.5270209312438965, "kl": 1.1625807750970125, "learning_rate": 7.243050588255737e-07, "loss": 0.1163, "num_tokens": 15497536.0, "reward": 0.77923583984375, "reward_std": 0.011963524855673313, "rewards//mean": 0.77923583984375, "rewards//std": 0.031071167439222336, "step": 1793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3588, "grad_norm": 9.999727249145508, "kl": 2.41350987367332, "learning_rate": 7.240214047851581e-07, "loss": 0.2414, "num_tokens": 15506176.0, "reward": 0.73736572265625, "reward_std": 0.008081833831965923, "rewards//mean": 0.73736572265625, "rewards//std": 0.03701702132821083, "step": 1794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.359, "grad_norm": 1.8980357646942139, "kl": 1.4534629695117474, "learning_rate": 7.237376605089476e-07, "loss": 0.1453, "num_tokens": 15514840.0, "reward": 0.76812744140625, "reward_std": 0.00910151470452547, "rewards//mean": 0.76812744140625, "rewards//std": 0.02597058191895485, "step": 1795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3592, "grad_norm": 2.3029587268829346, "kl": 1.6458844356238842, "learning_rate": 7.234538261112341e-07, "loss": 0.1646, "num_tokens": 15523432.0, "reward": 0.75567626953125, "reward_std": 0.01055966503918171, "rewards//mean": 0.75567626953125, "rewards//std": 0.02400852181017399, "step": 1796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3594, "grad_norm": 3.215376377105713, "kl": 1.1200422495603561, "learning_rate": 7.23169901706346e-07, "loss": 0.112, "num_tokens": 15532040.0, "reward": 0.736572265625, "reward_std": 0.007564832456409931, "rewards//mean": 0.736572265625, "rewards//std": 0.03307156264781952, "step": 1797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3596, "grad_norm": 2.5984058380126953, "kl": 1.0757994446903467, "learning_rate": 7.228858874086484e-07, "loss": 0.1076, "num_tokens": 15540624.0, "reward": 0.76824951171875, "reward_std": 0.004476871341466904, "rewards//mean": 0.76824951171875, "rewards//std": 0.01767677441239357, "step": 1798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3598, "grad_norm": 3.9091780185699463, "kl": 1.3355560693889856, "learning_rate": 7.226017833325419e-07, "loss": 0.1336, "num_tokens": 15549200.0, "reward": 0.7333984375, "reward_std": 0.00962809193879366, "rewards//mean": 0.7333984375, "rewards//std": 0.029413418844342232, "step": 1799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.36, "grad_norm": 8.66907024383545, "kl": 0.9297815319150686, "learning_rate": 7.223175895924637e-07, "loss": 0.093, "num_tokens": 15557776.0, "reward": 0.78564453125, "reward_std": 0.004353870637714863, "rewards//mean": 0.78564453125, "rewards//std": 0.022450489923357964, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3602, "grad_norm": 2.6104724407196045, "kl": 2.3057056684046984, "learning_rate": 7.220333063028871e-07, "loss": 0.2306, "num_tokens": 15566328.0, "reward": 0.73406982421875, "reward_std": 0.016842670738697052, "rewards//mean": 0.73406982421875, "rewards//std": 0.030948638916015625, "step": 1801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3604, "grad_norm": 5.116308689117432, "kl": 1.9751359019428492, "learning_rate": 7.217489335783211e-07, "loss": 0.1975, "num_tokens": 15575120.0, "reward": 0.80010986328125, "reward_std": 0.016279060393571854, "rewards//mean": 0.80010986328125, "rewards//std": 0.035129059106111526, "step": 1802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3606, "grad_norm": 1.5109270811080933, "kl": 0.8686944153159857, "learning_rate": 7.214644715333114e-07, "loss": 0.0869, "num_tokens": 15583712.0, "reward": 0.7593994140625, "reward_std": 0.0061800191178917885, "rewards//mean": 0.7593994140625, "rewards//std": 0.02092287689447403, "step": 1803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3608, "grad_norm": 5.090044975280762, "kl": 1.8055401183664799, "learning_rate": 7.211799202824388e-07, "loss": 0.1806, "num_tokens": 15592344.0, "reward": 0.76202392578125, "reward_std": 0.014151292853057384, "rewards//mean": 0.76202392578125, "rewards//std": 0.030499298125505447, "step": 1804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.361, "grad_norm": 2.3456919193267822, "kl": 1.314583534374833, "learning_rate": 7.20895279940321e-07, "loss": 0.1315, "num_tokens": 15600928.0, "reward": 0.7452392578125, "reward_std": 0.0053898547776043415, "rewards//mean": 0.7452392578125, "rewards//std": 0.02513747289776802, "step": 1805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3612, "grad_norm": 1.9758235216140747, "kl": 0.7922122180461884, "learning_rate": 7.206105506216106e-07, "loss": 0.0792, "num_tokens": 15609536.0, "reward": 0.76885986328125, "reward_std": 0.007231271825730801, "rewards//mean": 0.76885986328125, "rewards//std": 0.02681654877960682, "step": 1806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3614, "grad_norm": 4.113847255706787, "kl": 1.960466867312789, "learning_rate": 7.203257324409971e-07, "loss": 0.196, "num_tokens": 15618136.0, "reward": 0.75701904296875, "reward_std": 0.013052722439169884, "rewards//mean": 0.75701904296875, "rewards//std": 0.03603186458349228, "step": 1807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3616, "grad_norm": 2.9570446014404297, "kl": 1.1656327359378338, "learning_rate": 7.200408255132045e-07, "loss": 0.1166, "num_tokens": 15626680.0, "reward": 0.74884033203125, "reward_std": 0.004135049879550934, "rewards//mean": 0.74884033203125, "rewards//std": 0.01934659481048584, "step": 1808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3618, "grad_norm": 1.1830683946609497, "kl": 0.6443019825965166, "learning_rate": 7.19755829952994e-07, "loss": 0.0644, "num_tokens": 15635376.0, "reward": 0.791748046875, "reward_std": 0.0024971095845103264, "rewards//mean": 0.791748046875, "rewards//std": 0.016702014952898026, "step": 1809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.362, "grad_norm": 3.4049994945526123, "kl": 1.9126994479447603, "learning_rate": 7.194707458751615e-07, "loss": 0.1913, "num_tokens": 15644080.0, "reward": 0.75152587890625, "reward_std": 0.010969490744173527, "rewards//mean": 0.75152587890625, "rewards//std": 0.029697103425860405, "step": 1810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3622, "grad_norm": 9.74229907989502, "kl": 3.112483039498329, "learning_rate": 7.191855733945386e-07, "loss": 0.3112, "num_tokens": 15652968.0, "reward": 0.73614501953125, "reward_std": 0.013483827002346516, "rewards//mean": 0.73614501953125, "rewards//std": 0.04093911126255989, "step": 1811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3624, "grad_norm": 1.8979837894439697, "kl": 1.671231472864747, "learning_rate": 7.189003126259931e-07, "loss": 0.1671, "num_tokens": 15661712.0, "reward": 0.7601318359375, "reward_std": 0.01090779434889555, "rewards//mean": 0.7601318359375, "rewards//std": 0.026011131703853607, "step": 1812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3626, "grad_norm": 1.5321110486984253, "kl": 0.9612350445240736, "learning_rate": 7.186149636844279e-07, "loss": 0.0961, "num_tokens": 15670432.0, "reward": 0.77783203125, "reward_std": 0.0065762861631810665, "rewards//mean": 0.77783203125, "rewards//std": 0.02333912067115307, "step": 1813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3628, "grad_norm": 4.263502597808838, "kl": 1.7631815448403358, "learning_rate": 7.183295266847814e-07, "loss": 0.1763, "num_tokens": 15679056.0, "reward": 0.75384521484375, "reward_std": 0.013981279917061329, "rewards//mean": 0.75384521484375, "rewards//std": 0.023276690393686295, "step": 1814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.363, "grad_norm": 2.405259370803833, "kl": 1.0479095336049795, "learning_rate": 7.180440017420276e-07, "loss": 0.1048, "num_tokens": 15687624.0, "reward": 0.7518310546875, "reward_std": 0.005571077577769756, "rewards//mean": 0.7518310546875, "rewards//std": 0.029330700635910034, "step": 1815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3632, "grad_norm": 6.763623237609863, "kl": 1.44739506021142, "learning_rate": 7.177583889711762e-07, "loss": 0.1447, "num_tokens": 15696272.0, "reward": 0.77642822265625, "reward_std": 0.003670833073556423, "rewards//mean": 0.77642822265625, "rewards//std": 0.03301194682717323, "step": 1816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3634, "grad_norm": 0.9895033240318298, "kl": 1.0269442293792963, "learning_rate": 7.174726884872715e-07, "loss": 0.1027, "num_tokens": 15704840.0, "reward": 0.75531005859375, "reward_std": 0.004540668334811926, "rewards//mean": 0.75531005859375, "rewards//std": 0.029243025928735733, "step": 1817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3636, "grad_norm": 3.1822571754455566, "kl": 2.347819235175848, "learning_rate": 7.17186900405394e-07, "loss": 0.2348, "num_tokens": 15713432.0, "reward": 0.75091552734375, "reward_std": 0.021689504384994507, "rewards//mean": 0.75091552734375, "rewards//std": 0.035199228674173355, "step": 1818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3638, "grad_norm": 1.3811274766921997, "kl": 0.5692074857652187, "learning_rate": 7.169010248406588e-07, "loss": 0.0569, "num_tokens": 15722048.0, "reward": 0.75347900390625, "reward_std": 0.0030401148833334446, "rewards//mean": 0.75347900390625, "rewards//std": 0.0257528368383646, "step": 1819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.364, "grad_norm": 1.770247459411621, "kl": 1.1378947533667088, "learning_rate": 7.16615061908217e-07, "loss": 0.1138, "num_tokens": 15730736.0, "reward": 0.739501953125, "reward_std": 0.005134006962180138, "rewards//mean": 0.739501953125, "rewards//std": 0.04060612618923187, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3642, "grad_norm": 1.3821980953216553, "kl": 0.6415708940476179, "learning_rate": 7.163290117232541e-07, "loss": 0.0642, "num_tokens": 15739304.0, "reward": 0.76324462890625, "reward_std": 0.0005179004510864615, "rewards//mean": 0.76324462890625, "rewards//std": 0.021277552470564842, "step": 1821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3644, "grad_norm": 19.419260025024414, "kl": 1.0268391743302345, "learning_rate": 7.160428744009912e-07, "loss": 0.1027, "num_tokens": 15747912.0, "reward": 0.7359619140625, "reward_std": 0.008618786931037903, "rewards//mean": 0.7359619140625, "rewards//std": 0.026609499007463455, "step": 1822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3646, "grad_norm": 3.62929368019104, "kl": 2.081236759200692, "learning_rate": 7.157566500566842e-07, "loss": 0.2081, "num_tokens": 15756560.0, "reward": 0.78173828125, "reward_std": 0.01659890078008175, "rewards//mean": 0.78173828125, "rewards//std": 0.028961164876818657, "step": 1823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3648, "grad_norm": 4.564011573791504, "kl": 1.8572072703391314, "learning_rate": 7.154703388056244e-07, "loss": 0.1857, "num_tokens": 15765136.0, "reward": 0.7550048828125, "reward_std": 0.009905409999191761, "rewards//mean": 0.7550048828125, "rewards//std": 0.028412101790308952, "step": 1824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.365, "grad_norm": 2.2224252223968506, "kl": 1.4804861135780811, "learning_rate": 7.15183940763138e-07, "loss": 0.148, "num_tokens": 15773752.0, "reward": 0.76629638671875, "reward_std": 0.010908014141023159, "rewards//mean": 0.76629638671875, "rewards//std": 0.03380085900425911, "step": 1825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3652, "grad_norm": 5.550025463104248, "kl": 1.1781852692365646, "learning_rate": 7.148974560445858e-07, "loss": 0.1178, "num_tokens": 15782448.0, "reward": 0.80841064453125, "reward_std": 0.010814267210662365, "rewards//mean": 0.80841064453125, "rewards//std": 0.027869535610079765, "step": 1826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3654, "grad_norm": 5.150213241577148, "kl": 2.069683153182268, "learning_rate": 7.146108847653641e-07, "loss": 0.207, "num_tokens": 15791048.0, "reward": 0.74420166015625, "reward_std": 0.011490346863865852, "rewards//mean": 0.74420166015625, "rewards//std": 0.02910708449780941, "step": 1827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3656, "grad_norm": 3.5626261234283447, "kl": 1.5172289572656155, "learning_rate": 7.143242270409037e-07, "loss": 0.1517, "num_tokens": 15799808.0, "reward": 0.74676513671875, "reward_std": 0.012836494483053684, "rewards//mean": 0.74676513671875, "rewards//std": 0.04204024001955986, "step": 1828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3658, "grad_norm": 2.1888387203216553, "kl": 0.7403810862451792, "learning_rate": 7.140374829866702e-07, "loss": 0.074, "num_tokens": 15808392.0, "reward": 0.73809814453125, "reward_std": 0.005442460998892784, "rewards//mean": 0.73809814453125, "rewards//std": 0.022309036925435066, "step": 1829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.366, "grad_norm": 1.5039739608764648, "kl": 1.6122042424976826, "learning_rate": 7.137506527181643e-07, "loss": 0.1612, "num_tokens": 15817048.0, "reward": 0.751220703125, "reward_std": 0.006968109868466854, "rewards//mean": 0.751220703125, "rewards//std": 0.032710738480091095, "step": 1830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3662, "grad_norm": 4.752030372619629, "kl": 1.8018863126635551, "learning_rate": 7.134637363509209e-07, "loss": 0.1802, "num_tokens": 15825664.0, "reward": 0.75128173828125, "reward_std": 0.008976714685559273, "rewards//mean": 0.75128173828125, "rewards//std": 0.027796657755970955, "step": 1831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3664, "grad_norm": 2.9698989391326904, "kl": 1.3148400988429785, "learning_rate": 7.131767340005101e-07, "loss": 0.1315, "num_tokens": 15834296.0, "reward": 0.72882080078125, "reward_std": 0.007888147607445717, "rewards//mean": 0.72882080078125, "rewards//std": 0.02841496467590332, "step": 1832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3666, "grad_norm": 2.261000633239746, "kl": 1.148534793406725, "learning_rate": 7.128896457825363e-07, "loss": 0.1149, "num_tokens": 15842992.0, "reward": 0.77691650390625, "reward_std": 0.011105703189969063, "rewards//mean": 0.77691650390625, "rewards//std": 0.03385500609874725, "step": 1833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3668, "grad_norm": 10.06489086151123, "kl": 1.1229076366871595, "learning_rate": 7.126024718126387e-07, "loss": 0.1123, "num_tokens": 15851704.0, "reward": 0.75543212890625, "reward_std": 0.0066683851182460785, "rewards//mean": 0.75543212890625, "rewards//std": 0.026654046028852463, "step": 1834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.367, "grad_norm": 2.543853759765625, "kl": 0.6991872619837523, "learning_rate": 7.123152122064908e-07, "loss": 0.0699, "num_tokens": 15860272.0, "reward": 0.77618408203125, "reward_std": 0.0065511795692145824, "rewards//mean": 0.77618408203125, "rewards//std": 0.024136804044246674, "step": 1835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3672, "grad_norm": 3.3016326427459717, "kl": 1.433420417830348, "learning_rate": 7.120278670798009e-07, "loss": 0.1433, "num_tokens": 15868880.0, "reward": 0.76513671875, "reward_std": 0.014103761874139309, "rewards//mean": 0.76513671875, "rewards//std": 0.03648821637034416, "step": 1836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3674, "grad_norm": 15.681102752685547, "kl": 2.7154130339622498, "learning_rate": 7.117404365483115e-07, "loss": 0.2715, "num_tokens": 15877640.0, "reward": 0.7822265625, "reward_std": 0.010400941595435143, "rewards//mean": 0.7822265625, "rewards//std": 0.03711019083857536, "step": 1837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3676, "grad_norm": 2.0528388023376465, "kl": 0.8739600479602814, "learning_rate": 7.114529207277995e-07, "loss": 0.0874, "num_tokens": 15886312.0, "reward": 0.75274658203125, "reward_std": 0.004914752207696438, "rewards//mean": 0.75274658203125, "rewards//std": 0.025052646175026894, "step": 1838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3678, "grad_norm": 0.9327261447906494, "kl": 0.6090308651328087, "learning_rate": 7.111653197340764e-07, "loss": 0.0609, "num_tokens": 15894920.0, "reward": 0.73419189453125, "reward_std": 0.002529338002204895, "rewards//mean": 0.73419189453125, "rewards//std": 0.026267895475029945, "step": 1839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.368, "grad_norm": 2.3686490058898926, "kl": 0.9842582866549492, "learning_rate": 7.108776336829876e-07, "loss": 0.0984, "num_tokens": 15903472.0, "reward": 0.77490234375, "reward_std": 0.00263409037142992, "rewards//mean": 0.77490234375, "rewards//std": 0.01799515075981617, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3682, "grad_norm": 16.48367691040039, "kl": 1.4165671542286873, "learning_rate": 7.105898626904134e-07, "loss": 0.1417, "num_tokens": 15912104.0, "reward": 0.79345703125, "reward_std": 0.010839138180017471, "rewards//mean": 0.79345703125, "rewards//std": 0.022146357223391533, "step": 1841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3684, "grad_norm": 19.49656105041504, "kl": 1.8708918560296297, "learning_rate": 7.103020068722674e-07, "loss": 0.1871, "num_tokens": 15920808.0, "reward": 0.7288818359375, "reward_std": 0.011415512301027775, "rewards//mean": 0.7288818359375, "rewards//std": 0.03764619305729866, "step": 1842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3686, "grad_norm": 2.1908531188964844, "kl": 2.2299514431506395, "learning_rate": 7.100140663444984e-07, "loss": 0.223, "num_tokens": 15929496.0, "reward": 0.74114990234375, "reward_std": 0.01153961569070816, "rewards//mean": 0.74114990234375, "rewards//std": 0.0386400930583477, "step": 1843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3688, "grad_norm": 1.8744713068008423, "kl": 1.1681729834526777, "learning_rate": 7.097260412230885e-07, "loss": 0.1168, "num_tokens": 15938136.0, "reward": 0.755126953125, "reward_std": 0.00805087760090828, "rewards//mean": 0.755126953125, "rewards//std": 0.02592865191400051, "step": 1844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.369, "grad_norm": 3.0080108642578125, "kl": 1.4773118402808905, "learning_rate": 7.094379316240544e-07, "loss": 0.1477, "num_tokens": 15946768.0, "reward": 0.73944091796875, "reward_std": 0.005808740388602018, "rewards//mean": 0.73944091796875, "rewards//std": 0.02850751020014286, "step": 1845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3692, "grad_norm": 3.1908085346221924, "kl": 1.072961589321494, "learning_rate": 7.091497376634463e-07, "loss": 0.1073, "num_tokens": 15955440.0, "reward": 0.7869873046875, "reward_std": 0.007595873903483152, "rewards//mean": 0.7869873046875, "rewards//std": 0.028713131323456764, "step": 1846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3694, "grad_norm": 2.3255116939544678, "kl": 1.105245677754283, "learning_rate": 7.088614594573491e-07, "loss": 0.1105, "num_tokens": 15964048.0, "reward": 0.76324462890625, "reward_std": 0.008902833797037601, "rewards//mean": 0.76324462890625, "rewards//std": 0.034115131944417953, "step": 1847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3696, "grad_norm": 2.5090808868408203, "kl": 1.824024187400937, "learning_rate": 7.085730971218809e-07, "loss": 0.1824, "num_tokens": 15972664.0, "reward": 0.7740478515625, "reward_std": 0.01558766234666109, "rewards//mean": 0.7740478515625, "rewards//std": 0.025725562125444412, "step": 1848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3698, "grad_norm": 1.9827549457550049, "kl": 1.789289928972721, "learning_rate": 7.082846507731941e-07, "loss": 0.1789, "num_tokens": 15981304.0, "reward": 0.73907470703125, "reward_std": 0.012560350820422173, "rewards//mean": 0.73907470703125, "rewards//std": 0.031850166618824005, "step": 1849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.37, "grad_norm": 20.470462799072266, "kl": 2.968334635719657, "learning_rate": 7.079961205274748e-07, "loss": 0.2968, "num_tokens": 15989976.0, "reward": 0.78729248046875, "reward_std": 0.020587624981999397, "rewards//mean": 0.78729248046875, "rewards//std": 0.035908978432416916, "step": 1850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3702, "grad_norm": 2.3084218502044678, "kl": 1.7427243553102016, "learning_rate": 7.077075065009433e-07, "loss": 0.1743, "num_tokens": 15998608.0, "reward": 0.788330078125, "reward_std": 0.009603803977370262, "rewards//mean": 0.788330078125, "rewards//std": 0.035131268203258514, "step": 1851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3704, "grad_norm": 6.136078834533691, "kl": 1.7758771125227213, "learning_rate": 7.074188088098527e-07, "loss": 0.1776, "num_tokens": 16007184.0, "reward": 0.7642822265625, "reward_std": 0.009786337614059448, "rewards//mean": 0.7642822265625, "rewards//std": 0.03215320408344269, "step": 1852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3706, "grad_norm": 1.8461768627166748, "kl": 1.7065285798162222, "learning_rate": 7.071300275704909e-07, "loss": 0.1707, "num_tokens": 16015936.0, "reward": 0.7442626953125, "reward_std": 0.009451361373066902, "rewards//mean": 0.7442626953125, "rewards//std": 0.025094076991081238, "step": 1853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3708, "grad_norm": 8.691837310791016, "kl": 1.6497365441173315, "learning_rate": 7.068411628991787e-07, "loss": 0.165, "num_tokens": 16024664.0, "reward": 0.76043701171875, "reward_std": 0.0033445670269429684, "rewards//mean": 0.76043701171875, "rewards//std": 0.037383656948804855, "step": 1854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.371, "grad_norm": 1.724465250968933, "kl": 0.8312657419592142, "learning_rate": 7.065522149122709e-07, "loss": 0.0831, "num_tokens": 16033264.0, "reward": 0.731689453125, "reward_std": 0.005383252166211605, "rewards//mean": 0.731689453125, "rewards//std": 0.029967118054628372, "step": 1855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3712, "grad_norm": 1.1823596954345703, "kl": 0.9029164835810661, "learning_rate": 7.062631837261556e-07, "loss": 0.0903, "num_tokens": 16041776.0, "reward": 0.79107666015625, "reward_std": 0.005362005904316902, "rewards//mean": 0.79107666015625, "rewards//std": 0.021327294409275055, "step": 1856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3714, "grad_norm": 0.8451175093650818, "kl": 1.0803232304751873, "learning_rate": 7.059740694572545e-07, "loss": 0.108, "num_tokens": 16050400.0, "reward": 0.744140625, "reward_std": 0.003940070513635874, "rewards//mean": 0.744140625, "rewards//std": 0.020564204081892967, "step": 1857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3716, "grad_norm": 3.1342833042144775, "kl": 2.3297573048621416, "learning_rate": 7.056848722220228e-07, "loss": 0.233, "num_tokens": 16059080.0, "reward": 0.7744140625, "reward_std": 0.013841914013028145, "rewards//mean": 0.7744140625, "rewards//std": 0.029048843309283257, "step": 1858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3718, "grad_norm": 1.7096662521362305, "kl": 0.774777602404356, "learning_rate": 7.053955921369493e-07, "loss": 0.0775, "num_tokens": 16067680.0, "reward": 0.7767333984375, "reward_std": 0.0027954974211752415, "rewards//mean": 0.7767333984375, "rewards//std": 0.018897438421845436, "step": 1859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.372, "grad_norm": 5.2674994468688965, "kl": 1.6537601090967655, "learning_rate": 7.051062293185559e-07, "loss": 0.1654, "num_tokens": 16076320.0, "reward": 0.7572021484375, "reward_std": 0.014470890164375305, "rewards//mean": 0.7572021484375, "rewards//std": 0.036306675523519516, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3722, "grad_norm": 2.3366708755493164, "kl": 1.874405587092042, "learning_rate": 7.048167838833976e-07, "loss": 0.1874, "num_tokens": 16084880.0, "reward": 0.74871826171875, "reward_std": 0.00786502193659544, "rewards//mean": 0.74871826171875, "rewards//std": 0.027604296803474426, "step": 1861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3724, "grad_norm": 4.2101545333862305, "kl": 1.4288507923483849, "learning_rate": 7.045272559480635e-07, "loss": 0.1429, "num_tokens": 16093472.0, "reward": 0.743896484375, "reward_std": 0.01189388521015644, "rewards//mean": 0.743896484375, "rewards//std": 0.02821025624871254, "step": 1862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3726, "grad_norm": 6.675228118896484, "kl": 2.042024416849017, "learning_rate": 7.042376456291751e-07, "loss": 0.2042, "num_tokens": 16102144.0, "reward": 0.7470703125, "reward_std": 0.008256803266704082, "rewards//mean": 0.7470703125, "rewards//std": 0.0385444313287735, "step": 1863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3728, "grad_norm": 2.311246871948242, "kl": 1.6616391614079475, "learning_rate": 7.039479530433874e-07, "loss": 0.1662, "num_tokens": 16110744.0, "reward": 0.7674560546875, "reward_std": 0.009082126431167126, "rewards//mean": 0.7674560546875, "rewards//std": 0.02785683609545231, "step": 1864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.373, "grad_norm": 2.56750226020813, "kl": 1.7200164832174778, "learning_rate": 7.036581783073887e-07, "loss": 0.172, "num_tokens": 16119360.0, "reward": 0.73394775390625, "reward_std": 0.010016953572630882, "rewards//mean": 0.73394775390625, "rewards//std": 0.033682867884635925, "step": 1865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3732, "grad_norm": 9.25973129272461, "kl": 1.2967899721115828, "learning_rate": 7.033683215379002e-07, "loss": 0.1297, "num_tokens": 16128112.0, "reward": 0.7705078125, "reward_std": 0.007930691353976727, "rewards//mean": 0.7705078125, "rewards//std": 0.030480990186333656, "step": 1866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3734, "grad_norm": 5.735073089599609, "kl": 1.5535210128873587, "learning_rate": 7.030783828516759e-07, "loss": 0.1554, "num_tokens": 16136808.0, "reward": 0.7532958984375, "reward_std": 0.007747113239020109, "rewards//mean": 0.7532958984375, "rewards//std": 0.027270298451185226, "step": 1867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3736, "grad_norm": 5.924224376678467, "kl": 2.225294578820467, "learning_rate": 7.027883623655034e-07, "loss": 0.2225, "num_tokens": 16145360.0, "reward": 0.76019287109375, "reward_std": 0.011357331648468971, "rewards//mean": 0.76019287109375, "rewards//std": 0.02892867475748062, "step": 1868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3738, "grad_norm": 7.374082565307617, "kl": 1.942330228164792, "learning_rate": 7.024982601962026e-07, "loss": 0.1942, "num_tokens": 16154056.0, "reward": 0.75714111328125, "reward_std": 0.005384392105042934, "rewards//mean": 0.75714111328125, "rewards//std": 0.023994645103812218, "step": 1869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.374, "grad_norm": 3.31447696685791, "kl": 1.5992409456521273, "learning_rate": 7.022080764606271e-07, "loss": 0.1599, "num_tokens": 16162616.0, "reward": 0.73419189453125, "reward_std": 0.012074579484760761, "rewards//mean": 0.73419189453125, "rewards//std": 0.03620119392871857, "step": 1870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3742, "grad_norm": 3.434015989303589, "kl": 2.0065502747893333, "learning_rate": 7.019178112756625e-07, "loss": 0.2007, "num_tokens": 16171304.0, "reward": 0.7576904296875, "reward_std": 0.012057574465870857, "rewards//mean": 0.7576904296875, "rewards//std": 0.03374645859003067, "step": 1871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3744, "grad_norm": 1.9910322427749634, "kl": 0.966529119759798, "learning_rate": 7.016274647582276e-07, "loss": 0.0967, "num_tokens": 16179984.0, "reward": 0.78094482421875, "reward_std": 0.003195766592398286, "rewards//mean": 0.78094482421875, "rewards//std": 0.02564150094985962, "step": 1872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3746, "grad_norm": 3.2058041095733643, "kl": 1.9518991596996784, "learning_rate": 7.013370370252739e-07, "loss": 0.1952, "num_tokens": 16188520.0, "reward": 0.77301025390625, "reward_std": 0.00646575540304184, "rewards//mean": 0.77301025390625, "rewards//std": 0.01578104868531227, "step": 1873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3748, "grad_norm": 11.18813705444336, "kl": 1.291033249348402, "learning_rate": 7.010465281937858e-07, "loss": 0.1291, "num_tokens": 16197088.0, "reward": 0.78656005859375, "reward_std": 0.007830414921045303, "rewards//mean": 0.78656005859375, "rewards//std": 0.019861631095409393, "step": 1874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.375, "grad_norm": 47.68659973144531, "kl": 3.9973225481808186, "learning_rate": 7.007559383807802e-07, "loss": 0.3997, "num_tokens": 16205768.0, "reward": 0.74432373046875, "reward_std": 0.013239345513284206, "rewards//mean": 0.74432373046875, "rewards//std": 0.03863813355565071, "step": 1875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3752, "grad_norm": 22.866840362548828, "kl": 1.9784103631973267, "learning_rate": 7.004652677033068e-07, "loss": 0.1978, "num_tokens": 16214376.0, "reward": 0.760009765625, "reward_std": 0.014567185193300247, "rewards//mean": 0.760009765625, "rewards//std": 0.025561751797795296, "step": 1876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3754, "grad_norm": 13.356947898864746, "kl": 2.0567911826074123, "learning_rate": 7.001745162784475e-07, "loss": 0.2057, "num_tokens": 16222952.0, "reward": 0.7266845703125, "reward_std": 0.004197565373033285, "rewards//mean": 0.7266845703125, "rewards//std": 0.021451586857438087, "step": 1877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3756, "grad_norm": 8.387414932250977, "kl": 1.653434380888939, "learning_rate": 6.998836842233169e-07, "loss": 0.1653, "num_tokens": 16231624.0, "reward": 0.77984619140625, "reward_std": 0.008104367181658745, "rewards//mean": 0.77984619140625, "rewards//std": 0.03065030463039875, "step": 1878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3758, "grad_norm": 1.5858031511306763, "kl": 0.946024801582098, "learning_rate": 6.995927716550622e-07, "loss": 0.0946, "num_tokens": 16240272.0, "reward": 0.7742919921875, "reward_std": 0.007378405425697565, "rewards//mean": 0.7742919921875, "rewards//std": 0.023882482200860977, "step": 1879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.376, "grad_norm": 2.5536181926727295, "kl": 1.0827032681554556, "learning_rate": 6.99301778690863e-07, "loss": 0.1083, "num_tokens": 16248920.0, "reward": 0.7535400390625, "reward_std": 0.007021937519311905, "rewards//mean": 0.7535400390625, "rewards//std": 0.030062692239880562, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3762, "grad_norm": 13.058320045471191, "kl": 1.2034966554492712, "learning_rate": 6.990107054479312e-07, "loss": 0.1203, "num_tokens": 16257576.0, "reward": 0.763427734375, "reward_std": 0.005399170331656933, "rewards//mean": 0.763427734375, "rewards//std": 0.03184785321354866, "step": 1881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3764, "grad_norm": 0.7369622588157654, "kl": 0.800319992005825, "learning_rate": 6.987195520435109e-07, "loss": 0.08, "num_tokens": 16266248.0, "reward": 0.73876953125, "reward_std": 0.0027755668852478266, "rewards//mean": 0.73876953125, "rewards//std": 0.02507023885846138, "step": 1882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3766, "grad_norm": 1.3183344602584839, "kl": 0.9885077476501465, "learning_rate": 6.984283185948789e-07, "loss": 0.0989, "num_tokens": 16274880.0, "reward": 0.76861572265625, "reward_std": 0.006070063915103674, "rewards//mean": 0.76861572265625, "rewards//std": 0.03312912955880165, "step": 1883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3768, "grad_norm": 12.471165657043457, "kl": 1.6292383763939142, "learning_rate": 6.981370052193439e-07, "loss": 0.1629, "num_tokens": 16283720.0, "reward": 0.77667236328125, "reward_std": 0.009694737382233143, "rewards//mean": 0.77667236328125, "rewards//std": 0.031262047588825226, "step": 1884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.377, "grad_norm": 2.3145735263824463, "kl": 1.2088876217603683, "learning_rate": 6.978456120342469e-07, "loss": 0.1209, "num_tokens": 16292424.0, "reward": 0.783447265625, "reward_std": 0.008539421483874321, "rewards//mean": 0.783447265625, "rewards//std": 0.02722279727458954, "step": 1885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3772, "grad_norm": 12.633748054504395, "kl": 1.3388360384851694, "learning_rate": 6.975541391569609e-07, "loss": 0.1339, "num_tokens": 16301016.0, "reward": 0.75921630859375, "reward_std": 0.012317837215960026, "rewards//mean": 0.75921630859375, "rewards//std": 0.03363429754972458, "step": 1886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3774, "grad_norm": 7.928569793701172, "kl": 1.8477709367871284, "learning_rate": 6.972625867048914e-07, "loss": 0.1848, "num_tokens": 16309584.0, "reward": 0.755859375, "reward_std": 0.009783722460269928, "rewards//mean": 0.755859375, "rewards//std": 0.035495635122060776, "step": 1887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3776, "grad_norm": 7.905439376831055, "kl": 1.8555166963487864, "learning_rate": 6.969709547954755e-07, "loss": 0.1856, "num_tokens": 16318192.0, "reward": 0.77197265625, "reward_std": 0.009541463106870651, "rewards//mean": 0.77197265625, "rewards//std": 0.03386489674448967, "step": 1888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3778, "grad_norm": 1.6442062854766846, "kl": 0.5479441192001104, "learning_rate": 6.966792435461826e-07, "loss": 0.0548, "num_tokens": 16326832.0, "reward": 0.7723388671875, "reward_std": 0.0031074027065187693, "rewards//mean": 0.7723388671875, "rewards//std": 0.023077895864844322, "step": 1889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.378, "grad_norm": 7.154458999633789, "kl": 1.1153011620044708, "learning_rate": 6.963874530745139e-07, "loss": 0.1115, "num_tokens": 16335400.0, "reward": 0.740966796875, "reward_std": 0.009196332655847073, "rewards//mean": 0.740966796875, "rewards//std": 0.02685553953051567, "step": 1890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3782, "grad_norm": 2.1894285678863525, "kl": 1.2471293229609728, "learning_rate": 6.960955834980027e-07, "loss": 0.1247, "num_tokens": 16344080.0, "reward": 0.74151611328125, "reward_std": 0.006976440083235502, "rewards//mean": 0.74151611328125, "rewards//std": 0.02658979222178459, "step": 1891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3784, "grad_norm": 12.276518821716309, "kl": 3.3270397186279297, "learning_rate": 6.958036349342139e-07, "loss": 0.3327, "num_tokens": 16352640.0, "reward": 0.7413330078125, "reward_std": 0.016672108322381973, "rewards//mean": 0.7413330078125, "rewards//std": 0.03904059901833534, "step": 1892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3786, "grad_norm": 2.6406450271606445, "kl": 0.9585860967636108, "learning_rate": 6.955116075007442e-07, "loss": 0.0959, "num_tokens": 16361280.0, "reward": 0.76934814453125, "reward_std": 0.009336546994745731, "rewards//mean": 0.76934814453125, "rewards//std": 0.02935926616191864, "step": 1893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3788, "grad_norm": 7.74597692489624, "kl": 1.2267329227179289, "learning_rate": 6.952195013152225e-07, "loss": 0.1227, "num_tokens": 16369872.0, "reward": 0.764892578125, "reward_std": 0.008651645854115486, "rewards//mean": 0.764892578125, "rewards//std": 0.025590162724256516, "step": 1894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.379, "grad_norm": 1.802874207496643, "kl": 0.8312840610742569, "learning_rate": 6.94927316495309e-07, "loss": 0.0831, "num_tokens": 16378512.0, "reward": 0.77117919921875, "reward_std": 0.006767125800251961, "rewards//mean": 0.77117919921875, "rewards//std": 0.019172104075551033, "step": 1895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3792, "grad_norm": 2.32957124710083, "kl": 1.7574926782399416, "learning_rate": 6.946350531586957e-07, "loss": 0.1757, "num_tokens": 16387160.0, "reward": 0.75518798828125, "reward_std": 0.011470545083284378, "rewards//mean": 0.75518798828125, "rewards//std": 0.03320443630218506, "step": 1896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3794, "grad_norm": 5.658195495605469, "kl": 2.0864341594278812, "learning_rate": 6.943427114231063e-07, "loss": 0.2086, "num_tokens": 16395784.0, "reward": 0.76605224609375, "reward_std": 0.008342149667441845, "rewards//mean": 0.76605224609375, "rewards//std": 0.015595788136124611, "step": 1897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3796, "grad_norm": 1.0706034898757935, "kl": 0.6285277456045151, "learning_rate": 6.94050291406296e-07, "loss": 0.0629, "num_tokens": 16404432.0, "reward": 0.7647705078125, "reward_std": 0.0038165440782904625, "rewards//mean": 0.7647705078125, "rewards//std": 0.025774944573640823, "step": 1898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3798, "grad_norm": 2.2915685176849365, "kl": 1.2932017892599106, "learning_rate": 6.937577932260514e-07, "loss": 0.1293, "num_tokens": 16413160.0, "reward": 0.76568603515625, "reward_std": 0.00904356874525547, "rewards//mean": 0.76568603515625, "rewards//std": 0.02796551026403904, "step": 1899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.38, "grad_norm": 4.388787269592285, "kl": 2.3041618540883064, "learning_rate": 6.93465217000191e-07, "loss": 0.2304, "num_tokens": 16421768.0, "reward": 0.73626708984375, "reward_std": 0.017494281753897667, "rewards//mean": 0.73626708984375, "rewards//std": 0.04452718421816826, "step": 1900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3802, "grad_norm": 2.6694602966308594, "kl": 0.9762528147548437, "learning_rate": 6.931725628465642e-07, "loss": 0.0976, "num_tokens": 16430400.0, "reward": 0.74591064453125, "reward_std": 0.0068107424303889275, "rewards//mean": 0.74591064453125, "rewards//std": 0.026350749656558037, "step": 1901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3804, "grad_norm": 1.481976866722107, "kl": 1.0082335472106934, "learning_rate": 6.928798308830523e-07, "loss": 0.1008, "num_tokens": 16439048.0, "reward": 0.7430419921875, "reward_std": 0.007673492655158043, "rewards//mean": 0.7430419921875, "rewards//std": 0.03300994262099266, "step": 1902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3806, "grad_norm": 2.5784013271331787, "kl": 1.025176851078868, "learning_rate": 6.925870212275676e-07, "loss": 0.1025, "num_tokens": 16447656.0, "reward": 0.72705078125, "reward_std": 0.010677337646484375, "rewards//mean": 0.72705078125, "rewards//std": 0.03812427446246147, "step": 1903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3808, "grad_norm": 2.5363693237304688, "kl": 1.1179410461336374, "learning_rate": 6.922941339980537e-07, "loss": 0.1118, "num_tokens": 16456272.0, "reward": 0.76568603515625, "reward_std": 0.005083904135972261, "rewards//mean": 0.76568603515625, "rewards//std": 0.0326782688498497, "step": 1904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.381, "grad_norm": 4.336783409118652, "kl": 1.889224173501134, "learning_rate": 6.920011693124856e-07, "loss": 0.1889, "num_tokens": 16464952.0, "reward": 0.797119140625, "reward_std": 0.016552994027733803, "rewards//mean": 0.797119140625, "rewards//std": 0.03982317075133324, "step": 1905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3812, "grad_norm": 2.7505311965942383, "kl": 1.2263763677328825, "learning_rate": 6.917081272888696e-07, "loss": 0.1226, "num_tokens": 16473592.0, "reward": 0.78375244140625, "reward_std": 0.011583936400711536, "rewards//mean": 0.78375244140625, "rewards//std": 0.030941301956772804, "step": 1906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3814, "grad_norm": 3.5298731327056885, "kl": 0.8228543344885111, "learning_rate": 6.914150080452428e-07, "loss": 0.0823, "num_tokens": 16482232.0, "reward": 0.7437744140625, "reward_std": 0.004534014966338873, "rewards//mean": 0.7437744140625, "rewards//std": 0.0359480045735836, "step": 1907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3816, "grad_norm": 1.8533520698547363, "kl": 0.9186171144247055, "learning_rate": 6.911218116996736e-07, "loss": 0.0919, "num_tokens": 16490856.0, "reward": 0.76287841796875, "reward_std": 0.005424214527010918, "rewards//mean": 0.76287841796875, "rewards//std": 0.020167943090200424, "step": 1908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3818, "grad_norm": 4.732398986816406, "kl": 2.199810292571783, "learning_rate": 6.908285383702616e-07, "loss": 0.22, "num_tokens": 16499488.0, "reward": 0.767333984375, "reward_std": 0.016987569630146027, "rewards//mean": 0.767333984375, "rewards//std": 0.0346662774682045, "step": 1909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.382, "grad_norm": 0.935878336429596, "kl": 0.6690200604498386, "learning_rate": 6.905351881751371e-07, "loss": 0.0669, "num_tokens": 16508064.0, "reward": 0.76055908203125, "reward_std": 0.0030061921570450068, "rewards//mean": 0.76055908203125, "rewards//std": 0.03105313703417778, "step": 1910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3822, "grad_norm": 3.0205533504486084, "kl": 0.7351449299603701, "learning_rate": 6.902417612324615e-07, "loss": 0.0735, "num_tokens": 16516688.0, "reward": 0.71868896484375, "reward_std": 0.002594362013041973, "rewards//mean": 0.71868896484375, "rewards//std": 0.026543639600276947, "step": 1911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3824, "grad_norm": 2.0582287311553955, "kl": 0.8999587371945381, "learning_rate": 6.899482576604274e-07, "loss": 0.09, "num_tokens": 16525384.0, "reward": 0.71905517578125, "reward_std": 0.005210091359913349, "rewards//mean": 0.71905517578125, "rewards//std": 0.02900966815650463, "step": 1912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3826, "grad_norm": 3.1458547115325928, "kl": 1.3339712284505367, "learning_rate": 6.896546775772576e-07, "loss": 0.1334, "num_tokens": 16533976.0, "reward": 0.75115966796875, "reward_std": 0.005288252606987953, "rewards//mean": 0.75115966796875, "rewards//std": 0.029344825074076653, "step": 1913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3828, "grad_norm": 5.858672618865967, "kl": 1.3377245664596558, "learning_rate": 6.893610211012066e-07, "loss": 0.1338, "num_tokens": 16542632.0, "reward": 0.74371337890625, "reward_std": 0.005274464376270771, "rewards//mean": 0.74371337890625, "rewards//std": 0.024075891822576523, "step": 1914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.383, "grad_norm": 1.4700431823730469, "kl": 0.9460238479077816, "learning_rate": 6.890672883505588e-07, "loss": 0.0946, "num_tokens": 16551320.0, "reward": 0.80389404296875, "reward_std": 0.00816527009010315, "rewards//mean": 0.80389404296875, "rewards//std": 0.03156755119562149, "step": 1915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3832, "grad_norm": 5.436630725860596, "kl": 1.1437436919659376, "learning_rate": 6.887734794436299e-07, "loss": 0.1144, "num_tokens": 16559944.0, "reward": 0.79931640625, "reward_std": 0.013120634481310844, "rewards//mean": 0.79931640625, "rewards//std": 0.02296249009668827, "step": 1916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3834, "grad_norm": 7.216220378875732, "kl": 2.6739409286528826, "learning_rate": 6.884795944987661e-07, "loss": 0.2674, "num_tokens": 16568616.0, "reward": 0.73291015625, "reward_std": 0.016908852383494377, "rewards//mean": 0.73291015625, "rewards//std": 0.04493063688278198, "step": 1917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3836, "grad_norm": 2.0758872032165527, "kl": 1.1840754691511393, "learning_rate": 6.881856336343441e-07, "loss": 0.1184, "num_tokens": 16577184.0, "reward": 0.77850341796875, "reward_std": 0.010706901550292969, "rewards//mean": 0.77850341796875, "rewards//std": 0.024266904219985008, "step": 1918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3838, "grad_norm": 2.5120158195495605, "kl": 1.9357288107275963, "learning_rate": 6.878915969687714e-07, "loss": 0.1936, "num_tokens": 16585896.0, "reward": 0.75335693359375, "reward_std": 0.011784134432673454, "rewards//mean": 0.75335693359375, "rewards//std": 0.025637367740273476, "step": 1919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.384, "grad_norm": 4.904962062835693, "kl": 1.3946696668863297, "learning_rate": 6.875974846204858e-07, "loss": 0.1395, "num_tokens": 16594656.0, "reward": 0.76751708984375, "reward_std": 0.011228116229176521, "rewards//mean": 0.76751708984375, "rewards//std": 0.028981998562812805, "step": 1920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3842, "grad_norm": 2.127612590789795, "kl": 1.453100511804223, "learning_rate": 6.87303296707956e-07, "loss": 0.1453, "num_tokens": 16603224.0, "reward": 0.78070068359375, "reward_std": 0.010628015734255314, "rewards//mean": 0.78070068359375, "rewards//std": 0.03292470797896385, "step": 1921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3844, "grad_norm": 1.864574909210205, "kl": 1.8873520381748676, "learning_rate": 6.870090333496806e-07, "loss": 0.1887, "num_tokens": 16611944.0, "reward": 0.75738525390625, "reward_std": 0.010470920242369175, "rewards//mean": 0.75738525390625, "rewards//std": 0.01807059533894062, "step": 1922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3846, "grad_norm": 3.190722703933716, "kl": 1.9334096498787403, "learning_rate": 6.867146946641891e-07, "loss": 0.1933, "num_tokens": 16620656.0, "reward": 0.755859375, "reward_std": 0.014163737185299397, "rewards//mean": 0.755859375, "rewards//std": 0.025610268115997314, "step": 1923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3848, "grad_norm": 4.621662139892578, "kl": 0.9856888316571712, "learning_rate": 6.864202807700407e-07, "loss": 0.0986, "num_tokens": 16629280.0, "reward": 0.775634765625, "reward_std": 0.0029898949433118105, "rewards//mean": 0.775634765625, "rewards//std": 0.022051827982068062, "step": 1924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.385, "grad_norm": 2.4850211143493652, "kl": 2.518883015960455, "learning_rate": 6.861257917858257e-07, "loss": 0.2519, "num_tokens": 16637840.0, "reward": 0.75048828125, "reward_std": 0.018755264580249786, "rewards//mean": 0.75048828125, "rewards//std": 0.03805432841181755, "step": 1925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3852, "grad_norm": 3.111382484436035, "kl": 1.5108758825808764, "learning_rate": 6.858312278301637e-07, "loss": 0.1511, "num_tokens": 16646456.0, "reward": 0.758544921875, "reward_std": 0.007399954367429018, "rewards//mean": 0.758544921875, "rewards//std": 0.028398511931300163, "step": 1926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3854, "grad_norm": 1.1277971267700195, "kl": 1.0972880274057388, "learning_rate": 6.855365890217056e-07, "loss": 0.1097, "num_tokens": 16655072.0, "reward": 0.7708740234375, "reward_std": 0.007282741833478212, "rewards//mean": 0.7708740234375, "rewards//std": 0.028791053220629692, "step": 1927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3856, "grad_norm": 2.733692169189453, "kl": 1.738442301750183, "learning_rate": 6.852418754791316e-07, "loss": 0.1738, "num_tokens": 16663784.0, "reward": 0.82855224609375, "reward_std": 0.009084641002118587, "rewards//mean": 0.82855224609375, "rewards//std": 0.020506612956523895, "step": 1928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3858, "grad_norm": 8.57189655303955, "kl": 2.6197225730866194, "learning_rate": 6.849470873211522e-07, "loss": 0.262, "num_tokens": 16672448.0, "reward": 0.72967529296875, "reward_std": 0.01400065142661333, "rewards//mean": 0.72967529296875, "rewards//std": 0.04469108581542969, "step": 1929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.386, "grad_norm": 2.0354530811309814, "kl": 0.7951034177094698, "learning_rate": 6.846522246665083e-07, "loss": 0.0795, "num_tokens": 16681040.0, "reward": 0.7459716796875, "reward_std": 0.005416989326477051, "rewards//mean": 0.7459716796875, "rewards//std": 0.022423164919018745, "step": 1930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3862, "grad_norm": 1.7759015560150146, "kl": 1.3610915672034025, "learning_rate": 6.843572876339704e-07, "loss": 0.1361, "num_tokens": 16689648.0, "reward": 0.75347900390625, "reward_std": 0.00892785657197237, "rewards//mean": 0.75347900390625, "rewards//std": 0.02415122464299202, "step": 1931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3864, "grad_norm": 2.468689203262329, "kl": 0.974500609561801, "learning_rate": 6.840622763423391e-07, "loss": 0.0975, "num_tokens": 16698256.0, "reward": 0.74560546875, "reward_std": 0.007358761504292488, "rewards//mean": 0.74560546875, "rewards//std": 0.031616006046533585, "step": 1932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3866, "grad_norm": 1.4950695037841797, "kl": 1.2004152946174145, "learning_rate": 6.837671909104447e-07, "loss": 0.12, "num_tokens": 16706920.0, "reward": 0.750732421875, "reward_std": 0.0077482243068516254, "rewards//mean": 0.750732421875, "rewards//std": 0.03942586109042168, "step": 1933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3868, "grad_norm": 10.273947715759277, "kl": 2.033162873238325, "learning_rate": 6.834720314571479e-07, "loss": 0.2033, "num_tokens": 16715640.0, "reward": 0.7603759765625, "reward_std": 0.008452069014310837, "rewards//mean": 0.7603759765625, "rewards//std": 0.03224346786737442, "step": 1934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.387, "grad_norm": 3.9013893604278564, "kl": 1.0506266802549362, "learning_rate": 6.831767981013388e-07, "loss": 0.1051, "num_tokens": 16724336.0, "reward": 0.74005126953125, "reward_std": 0.0036983320023864508, "rewards//mean": 0.74005126953125, "rewards//std": 0.025901123881340027, "step": 1935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3872, "grad_norm": 8.675947189331055, "kl": 1.2521464210003614, "learning_rate": 6.828814909619372e-07, "loss": 0.1252, "num_tokens": 16733000.0, "reward": 0.7713623046875, "reward_std": 0.0072280410677194595, "rewards//mean": 0.7713623046875, "rewards//std": 0.02669356018304825, "step": 1936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3874, "grad_norm": 4.412496089935303, "kl": 1.6320644859224558, "learning_rate": 6.82586110157893e-07, "loss": 0.1632, "num_tokens": 16741664.0, "reward": 0.73004150390625, "reward_std": 0.011390022933483124, "rewards//mean": 0.73004150390625, "rewards//std": 0.03067202866077423, "step": 1937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3876, "grad_norm": 3.2514443397521973, "kl": 1.6396849621087313, "learning_rate": 6.822906558081856e-07, "loss": 0.164, "num_tokens": 16750360.0, "reward": 0.7725830078125, "reward_std": 0.01013142429292202, "rewards//mean": 0.7725830078125, "rewards//std": 0.029243865981698036, "step": 1938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3878, "grad_norm": 2.7781484127044678, "kl": 1.7112006973475218, "learning_rate": 6.819951280318236e-07, "loss": 0.1711, "num_tokens": 16758984.0, "reward": 0.7593994140625, "reward_std": 0.012383747845888138, "rewards//mean": 0.7593994140625, "rewards//std": 0.032827842980623245, "step": 1939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.388, "grad_norm": 1.5462831258773804, "kl": 0.8432313669472933, "learning_rate": 6.816995269478459e-07, "loss": 0.0843, "num_tokens": 16767672.0, "reward": 0.76971435546875, "reward_std": 0.00866425596177578, "rewards//mean": 0.76971435546875, "rewards//std": 0.030357016250491142, "step": 1940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3882, "grad_norm": 2.730386257171631, "kl": 1.5600259955972433, "learning_rate": 6.814038526753204e-07, "loss": 0.156, "num_tokens": 16776352.0, "reward": 0.7388916015625, "reward_std": 0.009221317246556282, "rewards//mean": 0.7388916015625, "rewards//std": 0.034787241369485855, "step": 1941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3884, "grad_norm": 8.104711532592773, "kl": 1.7729483786970377, "learning_rate": 6.811081053333449e-07, "loss": 0.1773, "num_tokens": 16784968.0, "reward": 0.72039794921875, "reward_std": 0.00957479514181614, "rewards//mean": 0.72039794921875, "rewards//std": 0.04459682106971741, "step": 1942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3886, "grad_norm": 2.123380422592163, "kl": 1.1190021578222513, "learning_rate": 6.80812285041046e-07, "loss": 0.1119, "num_tokens": 16793592.0, "reward": 0.781494140625, "reward_std": 0.0060582393780350685, "rewards//mean": 0.781494140625, "rewards//std": 0.017851572483778, "step": 1943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3888, "grad_norm": 1.211398959159851, "kl": 1.0127136074006557, "learning_rate": 6.805163919175806e-07, "loss": 0.1013, "num_tokens": 16802216.0, "reward": 0.7286376953125, "reward_std": 0.006353127770125866, "rewards//mean": 0.7286376953125, "rewards//std": 0.03037726692855358, "step": 1944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.389, "grad_norm": 9.318678855895996, "kl": 0.8566135875880718, "learning_rate": 6.80220426082134e-07, "loss": 0.0857, "num_tokens": 16810928.0, "reward": 0.77288818359375, "reward_std": 0.007881369441747665, "rewards//mean": 0.77288818359375, "rewards//std": 0.020432662218809128, "step": 1945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3892, "grad_norm": 2.9845526218414307, "kl": 1.3708241041749716, "learning_rate": 6.799243876539213e-07, "loss": 0.1371, "num_tokens": 16819560.0, "reward": 0.73297119140625, "reward_std": 0.010097953490912914, "rewards//mean": 0.73297119140625, "rewards//std": 0.03179832175374031, "step": 1946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3894, "grad_norm": 3.9203383922576904, "kl": 0.6361210681498051, "learning_rate": 6.796282767521869e-07, "loss": 0.0636, "num_tokens": 16828184.0, "reward": 0.740966796875, "reward_std": 0.001035800902172923, "rewards//mean": 0.740966796875, "rewards//std": 0.026354841887950897, "step": 1947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3896, "grad_norm": 3.9711384773254395, "kl": 2.2377002704888582, "learning_rate": 6.793320934962038e-07, "loss": 0.2238, "num_tokens": 16836816.0, "reward": 0.7701416015625, "reward_std": 0.015632741153240204, "rewards//mean": 0.7701416015625, "rewards//std": 0.03237839788198471, "step": 1948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3898, "grad_norm": 4.52419900894165, "kl": 2.323673529550433, "learning_rate": 6.790358380052751e-07, "loss": 0.2324, "num_tokens": 16845448.0, "reward": 0.71630859375, "reward_std": 0.014104368165135384, "rewards//mean": 0.71630859375, "rewards//std": 0.03141619265079498, "step": 1949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.39, "grad_norm": 2.7977373600006104, "kl": 1.09340226277709, "learning_rate": 6.787395103987322e-07, "loss": 0.1093, "num_tokens": 16854072.0, "reward": 0.72857666015625, "reward_std": 0.005173890385776758, "rewards//mean": 0.72857666015625, "rewards//std": 0.023083388805389404, "step": 1950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3902, "grad_norm": 5.4792633056640625, "kl": 2.648605877533555, "learning_rate": 6.784431107959358e-07, "loss": 0.2649, "num_tokens": 16862752.0, "reward": 0.75103759765625, "reward_std": 0.015550365671515465, "rewards//mean": 0.75103759765625, "rewards//std": 0.04376595467329025, "step": 1951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3904, "grad_norm": 1.7156985998153687, "kl": 1.6892276592552662, "learning_rate": 6.781466393162761e-07, "loss": 0.1689, "num_tokens": 16871304.0, "reward": 0.7227783203125, "reward_std": 0.008895116858184338, "rewards//mean": 0.7227783203125, "rewards//std": 0.04620348662137985, "step": 1952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3906, "grad_norm": 4.42630672454834, "kl": 2.740915870293975, "learning_rate": 6.778500960791708e-07, "loss": 0.2741, "num_tokens": 16879952.0, "reward": 0.74871826171875, "reward_std": 0.016002818942070007, "rewards//mean": 0.74871826171875, "rewards//std": 0.04002445936203003, "step": 1953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3908, "grad_norm": 4.884814739227295, "kl": 1.6524646487087011, "learning_rate": 6.775534812040686e-07, "loss": 0.1652, "num_tokens": 16888640.0, "reward": 0.762451171875, "reward_std": 0.006168714724481106, "rewards//mean": 0.762451171875, "rewards//std": 0.024058358743786812, "step": 1954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.391, "grad_norm": 5.532837867736816, "kl": 1.9937932193279266, "learning_rate": 6.772567948104452e-07, "loss": 0.1994, "num_tokens": 16897224.0, "reward": 0.75537109375, "reward_std": 0.021407444030046463, "rewards//mean": 0.75537109375, "rewards//std": 0.03148550167679787, "step": 1955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3912, "grad_norm": 9.207106590270996, "kl": 1.4597187489271164, "learning_rate": 6.769600370178059e-07, "loss": 0.146, "num_tokens": 16905904.0, "reward": 0.74700927734375, "reward_std": 0.007965510711073875, "rewards//mean": 0.74700927734375, "rewards//std": 0.034030720591545105, "step": 1956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3914, "grad_norm": 4.234192848205566, "kl": 1.5983269568532705, "learning_rate": 6.766632079456851e-07, "loss": 0.1598, "num_tokens": 16914576.0, "reward": 0.75140380859375, "reward_std": 0.015184727497398853, "rewards//mean": 0.75140380859375, "rewards//std": 0.03192754462361336, "step": 1957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3916, "grad_norm": 2.5036401748657227, "kl": 1.0536338668316603, "learning_rate": 6.76366307713645e-07, "loss": 0.1054, "num_tokens": 16923160.0, "reward": 0.71319580078125, "reward_std": 0.005778812803328037, "rewards//mean": 0.71319580078125, "rewards//std": 0.03547382354736328, "step": 1958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3918, "grad_norm": 5.060679912567139, "kl": 1.984919572249055, "learning_rate": 6.760693364412775e-07, "loss": 0.1985, "num_tokens": 16931832.0, "reward": 0.77227783203125, "reward_std": 0.011234838515520096, "rewards//mean": 0.77227783203125, "rewards//std": 0.03165566176176071, "step": 1959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.392, "grad_norm": 9.679033279418945, "kl": 0.9287089873105288, "learning_rate": 6.757722942482022e-07, "loss": 0.0929, "num_tokens": 16940536.0, "reward": 0.7427978515625, "reward_std": 0.0045805806294083595, "rewards//mean": 0.7427978515625, "rewards//std": 0.02992946282029152, "step": 1960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3922, "grad_norm": 6.345630645751953, "kl": 2.2394301649183035, "learning_rate": 6.754751812540679e-07, "loss": 0.2239, "num_tokens": 16949200.0, "reward": 0.7667236328125, "reward_std": 0.013151612132787704, "rewards//mean": 0.7667236328125, "rewards//std": 0.033323951065540314, "step": 1961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3924, "grad_norm": 6.910009860992432, "kl": 2.583277940750122, "learning_rate": 6.751779975785514e-07, "loss": 0.2583, "num_tokens": 16957848.0, "reward": 0.741455078125, "reward_std": 0.014633771032094955, "rewards//mean": 0.741455078125, "rewards//std": 0.04051057994365692, "step": 1962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3926, "grad_norm": 7.566949367523193, "kl": 1.6424854304641485, "learning_rate": 6.748807433413586e-07, "loss": 0.1642, "num_tokens": 16966464.0, "reward": 0.78973388671875, "reward_std": 0.01055777445435524, "rewards//mean": 0.78973388671875, "rewards//std": 0.029020102694630623, "step": 1963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3928, "grad_norm": 15.303129196166992, "kl": 3.1461045145988464, "learning_rate": 6.745834186622231e-07, "loss": 0.3146, "num_tokens": 16975304.0, "reward": 0.73602294921875, "reward_std": 0.013409584760665894, "rewards//mean": 0.73602294921875, "rewards//std": 0.03864988312125206, "step": 1964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.393, "grad_norm": 5.361306190490723, "kl": 1.765540674328804, "learning_rate": 6.742860236609076e-07, "loss": 0.1766, "num_tokens": 16983960.0, "reward": 0.739990234375, "reward_std": 0.014041764661669731, "rewards//mean": 0.739990234375, "rewards//std": 0.03307156264781952, "step": 1965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3932, "grad_norm": 4.571110725402832, "kl": 1.7866956647485495, "learning_rate": 6.739885584572025e-07, "loss": 0.1787, "num_tokens": 16992616.0, "reward": 0.7611083984375, "reward_std": 0.009677095338702202, "rewards//mean": 0.7611083984375, "rewards//std": 0.02764080837368965, "step": 1966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3934, "grad_norm": 9.737143516540527, "kl": 1.4153119344264269, "learning_rate": 6.73691023170927e-07, "loss": 0.1415, "num_tokens": 17001224.0, "reward": 0.74267578125, "reward_std": 0.009522315114736557, "rewards//mean": 0.74267578125, "rewards//std": 0.044169504195451736, "step": 1967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3936, "grad_norm": 8.495573997497559, "kl": 1.613790376111865, "learning_rate": 6.733934179219281e-07, "loss": 0.1614, "num_tokens": 17009864.0, "reward": 0.76177978515625, "reward_std": 0.013794094324111938, "rewards//mean": 0.76177978515625, "rewards//std": 0.027067698538303375, "step": 1968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3938, "grad_norm": 3.696143388748169, "kl": 1.8864859715104103, "learning_rate": 6.730957428300811e-07, "loss": 0.1886, "num_tokens": 17018376.0, "reward": 0.7576904296875, "reward_std": 0.014782344922423363, "rewards//mean": 0.7576904296875, "rewards//std": 0.03793619945645332, "step": 1969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.394, "grad_norm": 3.1203043460845947, "kl": 0.6857241913676262, "learning_rate": 6.727979980152898e-07, "loss": 0.0686, "num_tokens": 17026976.0, "reward": 0.77105712890625, "reward_std": 0.005191301926970482, "rewards//mean": 0.77105712890625, "rewards//std": 0.02950071543455124, "step": 1970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3942, "grad_norm": 2.8411827087402344, "kl": 0.9823378846049309, "learning_rate": 6.725001835974852e-07, "loss": 0.0982, "num_tokens": 17035568.0, "reward": 0.76788330078125, "reward_std": 0.005136963911354542, "rewards//mean": 0.76788330078125, "rewards//std": 0.025159969925880432, "step": 1971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3944, "grad_norm": 3.5230236053466797, "kl": 0.899571318179369, "learning_rate": 6.722022996966277e-07, "loss": 0.09, "num_tokens": 17044176.0, "reward": 0.7330322265625, "reward_std": 0.003262493060901761, "rewards//mean": 0.7330322265625, "rewards//std": 0.02540343999862671, "step": 1972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3946, "grad_norm": 5.399919033050537, "kl": 1.9057795070111752, "learning_rate": 6.719043464327042e-07, "loss": 0.1906, "num_tokens": 17052752.0, "reward": 0.75933837890625, "reward_std": 0.01613207533955574, "rewards//mean": 0.75933837890625, "rewards//std": 0.03235892951488495, "step": 1973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3948, "grad_norm": 5.980194568634033, "kl": 1.6304620802402496, "learning_rate": 6.716063239257306e-07, "loss": 0.163, "num_tokens": 17061416.0, "reward": 0.75360107421875, "reward_std": 0.010458397679030895, "rewards//mean": 0.75360107421875, "rewards//std": 0.040333397686481476, "step": 1974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.395, "grad_norm": 8.40462589263916, "kl": 2.120277812704444, "learning_rate": 6.713082322957502e-07, "loss": 0.212, "num_tokens": 17070008.0, "reward": 0.7569580078125, "reward_std": 0.01008752454072237, "rewards//mean": 0.7569580078125, "rewards//std": 0.028717348352074623, "step": 1975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3952, "grad_norm": 17.99199867248535, "kl": 2.275618724524975, "learning_rate": 6.710100716628344e-07, "loss": 0.2276, "num_tokens": 17078680.0, "reward": 0.75274658203125, "reward_std": 0.013096587732434273, "rewards//mean": 0.75274658203125, "rewards//std": 0.03835975006222725, "step": 1976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3954, "grad_norm": 10.808684349060059, "kl": 2.4570872634649277, "learning_rate": 6.70711842147082e-07, "loss": 0.2457, "num_tokens": 17087232.0, "reward": 0.74688720703125, "reward_std": 0.013904260471463203, "rewards//mean": 0.74688720703125, "rewards//std": 0.03641964867711067, "step": 1977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3956, "grad_norm": 12.57481861114502, "kl": 2.9545410871505737, "learning_rate": 6.704135438686203e-07, "loss": 0.2955, "num_tokens": 17095856.0, "reward": 0.76068115234375, "reward_std": 0.011217588558793068, "rewards//mean": 0.76068115234375, "rewards//std": 0.03128286078572273, "step": 1978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3958, "grad_norm": 16.341278076171875, "kl": 2.0866538248956203, "learning_rate": 6.701151769476032e-07, "loss": 0.2087, "num_tokens": 17104456.0, "reward": 0.75799560546875, "reward_std": 0.009943559765815735, "rewards//mean": 0.75799560546875, "rewards//std": 0.028943846002221107, "step": 1979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.396, "grad_norm": 20.670608520507812, "kl": 2.6539546567946672, "learning_rate": 6.698167415042134e-07, "loss": 0.2654, "num_tokens": 17113160.0, "reward": 0.72332763671875, "reward_std": 0.01024525985121727, "rewards//mean": 0.72332763671875, "rewards//std": 0.034114688634872437, "step": 1980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3962, "grad_norm": 26.590112686157227, "kl": 3.4797907043248415, "learning_rate": 6.695182376586602e-07, "loss": 0.348, "num_tokens": 17121936.0, "reward": 0.74481201171875, "reward_std": 0.020049354061484337, "rewards//mean": 0.74481201171875, "rewards//std": 0.044971343129873276, "step": 1981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3964, "grad_norm": 5.345390796661377, "kl": 1.8074094392359257, "learning_rate": 6.692196655311814e-07, "loss": 0.1807, "num_tokens": 17130496.0, "reward": 0.7347412109375, "reward_std": 0.014416481368243694, "rewards//mean": 0.7347412109375, "rewards//std": 0.0392586775124073, "step": 1982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3966, "grad_norm": 19.076135635375977, "kl": 2.9417112972587347, "learning_rate": 6.689210252420415e-07, "loss": 0.2942, "num_tokens": 17139192.0, "reward": 0.7327880859375, "reward_std": 0.021368755027651787, "rewards//mean": 0.7327880859375, "rewards//std": 0.05111980438232422, "step": 1983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3968, "grad_norm": 21.99047088623047, "kl": 3.586339859291911, "learning_rate": 6.686223169115327e-07, "loss": 0.3586, "num_tokens": 17147984.0, "reward": 0.7293701171875, "reward_std": 0.010650159791111946, "rewards//mean": 0.7293701171875, "rewards//std": 0.03817486763000488, "step": 1984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.397, "grad_norm": 7.656482219696045, "kl": 0.8482975848019123, "learning_rate": 6.683235406599749e-07, "loss": 0.0848, "num_tokens": 17156640.0, "reward": 0.7391357421875, "reward_std": 0.005536393262445927, "rewards//mean": 0.7391357421875, "rewards//std": 0.035973262041807175, "step": 1985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3972, "grad_norm": 8.579667091369629, "kl": 2.4071076661348343, "learning_rate": 6.68024696607715e-07, "loss": 0.2407, "num_tokens": 17165376.0, "reward": 0.76629638671875, "reward_std": 0.0118489945307374, "rewards//mean": 0.76629638671875, "rewards//std": 0.02634270489215851, "step": 1986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3974, "grad_norm": 3.7696523666381836, "kl": 1.5843081548810005, "learning_rate": 6.677257848751276e-07, "loss": 0.1584, "num_tokens": 17174008.0, "reward": 0.78240966796875, "reward_std": 0.009303020313382149, "rewards//mean": 0.78240966796875, "rewards//std": 0.024741342291235924, "step": 1987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3976, "grad_norm": 2.2588491439819336, "kl": 0.9959504008293152, "learning_rate": 6.674268055826138e-07, "loss": 0.0996, "num_tokens": 17182720.0, "reward": 0.76434326171875, "reward_std": 0.006129647605121136, "rewards//mean": 0.76434326171875, "rewards//std": 0.02718878537416458, "step": 1988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3978, "grad_norm": 5.689353942871094, "kl": 0.9194694440811872, "learning_rate": 6.671277588506029e-07, "loss": 0.0919, "num_tokens": 17191400.0, "reward": 0.7371826171875, "reward_std": 0.0032536799553781748, "rewards//mean": 0.7371826171875, "rewards//std": 0.03268550708889961, "step": 1989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.398, "grad_norm": 2.835458993911743, "kl": 0.6638965383172035, "learning_rate": 6.668286447995507e-07, "loss": 0.0664, "num_tokens": 17200032.0, "reward": 0.748779296875, "reward_std": 0.0030498532578349113, "rewards//mean": 0.748779296875, "rewards//std": 0.025514332577586174, "step": 1990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3982, "grad_norm": 4.356527328491211, "kl": 1.531398318707943, "learning_rate": 6.665294635499403e-07, "loss": 0.1531, "num_tokens": 17208680.0, "reward": 0.77423095703125, "reward_std": 0.01328960806131363, "rewards//mean": 0.77423095703125, "rewards//std": 0.027447011321783066, "step": 1991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3984, "grad_norm": 6.774599075317383, "kl": 0.9724132753908634, "learning_rate": 6.66230215222282e-07, "loss": 0.0972, "num_tokens": 17217296.0, "reward": 0.76702880859375, "reward_std": 0.009563660249114037, "rewards//mean": 0.76702880859375, "rewards//std": 0.03426919877529144, "step": 1992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3986, "grad_norm": 2.9007930755615234, "kl": 1.5415233988314867, "learning_rate": 6.659308999371129e-07, "loss": 0.1542, "num_tokens": 17225960.0, "reward": 0.7188720703125, "reward_std": 0.012013616971671581, "rewards//mean": 0.7188720703125, "rewards//std": 0.03355933353304863, "step": 1993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3988, "grad_norm": 3.7563085556030273, "kl": 1.1901140250265598, "learning_rate": 6.65631517814997e-07, "loss": 0.119, "num_tokens": 17234600.0, "reward": 0.74310302734375, "reward_std": 0.008727531880140305, "rewards//mean": 0.74310302734375, "rewards//std": 0.031702011823654175, "step": 1994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.399, "grad_norm": 2.4339611530303955, "kl": 1.5453631542623043, "learning_rate": 6.653320689765256e-07, "loss": 0.1545, "num_tokens": 17243224.0, "reward": 0.76031494140625, "reward_std": 0.01022228505462408, "rewards//mean": 0.76031494140625, "rewards//std": 0.026100818067789078, "step": 1995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3992, "grad_norm": 3.2661845684051514, "kl": 1.124980976805091, "learning_rate": 6.650325535423166e-07, "loss": 0.1125, "num_tokens": 17251840.0, "reward": 0.7269287109375, "reward_std": 0.007605080492794514, "rewards//mean": 0.7269287109375, "rewards//std": 0.027539854869246483, "step": 1996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3994, "grad_norm": 2.4316375255584717, "kl": 0.7848870139569044, "learning_rate": 6.647329716330147e-07, "loss": 0.0785, "num_tokens": 17260456.0, "reward": 0.73541259765625, "reward_std": 0.007167475763708353, "rewards//mean": 0.73541259765625, "rewards//std": 0.0317058339715004, "step": 1997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3996, "grad_norm": 3.2776546478271484, "kl": 1.6006934456527233, "learning_rate": 6.644333233692916e-07, "loss": 0.1601, "num_tokens": 17269256.0, "reward": 0.7718505859375, "reward_std": 0.013405588455498219, "rewards//mean": 0.7718505859375, "rewards//std": 0.029312115162611008, "step": 1998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.3998, "grad_norm": 2.0470504760742188, "kl": 0.7851455882191658, "learning_rate": 6.641336088718456e-07, "loss": 0.0785, "num_tokens": 17277904.0, "reward": 0.7802734375, "reward_std": 0.007900599390268326, "rewards//mean": 0.7802734375, "rewards//std": 0.02856115810573101, "step": 1999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 128.0, "epoch": 0.4, "grad_norm": 2.0166735649108887, "kl": 1.69225930608809, "learning_rate": 6.638338282614014e-07, "loss": 0.1692, "num_tokens": 17286544.0, "reward": 0.71392822265625, "reward_std": 0.01248107198625803, "rewards//mean": 0.71392822265625, "rewards//std": 0.03881208226084709, "step": 2000 } ], "logging_steps": 1, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }