diff --git "a/checkpoint-4000/trainer_state.json" "b/checkpoint-4000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-4000/trainer_state.json" @@ -0,0 +1,76034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8, + "eval_steps": 500, + "global_step": 4000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0002, + "grad_norm": 3.0746591091156006, + "kl": 0.0001370312529616058, + "learning_rate": 0.0, + "loss": 0.0, + "num_tokens": 8600.0, + "reward": 0.700439453125, + "reward_std": 0.014704298228025436, + "rewards//mean": 0.700439453125, + "rewards//std": 0.04464063048362732, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0004, + "grad_norm": 2.9869141578674316, + "kl": 6.267779826885089e-05, + "learning_rate": 2e-08, + "loss": 0.0, + "num_tokens": 17200.0, + "reward": 0.73077392578125, + "reward_std": 0.015316192060709, + "rewards//mean": 0.73077392578125, + "rewards//std": 0.05491425469517708, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0006, + "grad_norm": 3.3335037231445312, + "kl": 0.0005714244398404844, + "learning_rate": 4e-08, + "loss": 0.0001, + "num_tokens": 25872.0, + "reward": 0.742431640625, + "reward_std": 0.012949886731803417, + "rewards//mean": 0.742431640625, + "rewards//std": 0.04560147598385811, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0008, + "grad_norm": 3.0980048179626465, + "kl": 0.0005316020688042045, + "learning_rate": 6e-08, + "loss": 0.0001, + "num_tokens": 34600.0, + "reward": 0.715576171875, + "reward_std": 0.015215152874588966, + "rewards//mean": 0.715576171875, + "rewards//std": 0.050897374749183655, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.001, + "grad_norm": 3.049089193344116, + "kl": 0.000517527550982777, + "learning_rate": 8e-08, + "loss": 0.0001, + "num_tokens": 43304.0, + "reward": 0.71856689453125, + "reward_std": 0.01438464131206274, + "rewards//mean": 0.71856689453125, + "rewards//std": 0.05342591553926468, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0012, + "grad_norm": 2.9623756408691406, + "kl": 0.0005474825884448364, + "learning_rate": 1e-07, + "loss": 0.0001, + "num_tokens": 51992.0, + "reward": 0.713134765625, + "reward_std": 0.012754758819937706, + "rewards//mean": 0.713134765625, + "rewards//std": 0.05878513306379318, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0014, + "grad_norm": 3.1190226078033447, + "kl": 0.0005763711305917241, + "learning_rate": 1.2e-07, + "loss": 0.0001, + "num_tokens": 60696.0, + "reward": 0.7222900390625, + "reward_std": 0.014144840650260448, + "rewards//mean": 0.7222900390625, + "rewards//std": 0.03965155407786369, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0016, + "grad_norm": 2.9739928245544434, + "kl": 0.0005803547683171928, + "learning_rate": 1.4e-07, + "loss": 0.0001, + "num_tokens": 69336.0, + "reward": 0.732421875, + "reward_std": 0.01358483824878931, + "rewards//mean": 0.732421875, + "rewards//std": 0.05295494943857193, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0018, + "grad_norm": 3.1628496646881104, + "kl": 0.0005657363144564442, + "learning_rate": 1.6e-07, + "loss": 0.0001, + "num_tokens": 78008.0, + "reward": 0.70953369140625, + "reward_std": 0.013684568926692009, + "rewards//mean": 0.70953369140625, + "rewards//std": 0.05245111510157585, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.002, + "grad_norm": 3.1755638122558594, + "kl": 0.0005453915146063082, + "learning_rate": 1.8e-07, + "loss": 0.0001, + "num_tokens": 86648.0, + "reward": 0.68017578125, + "reward_std": 0.014631778001785278, + "rewards//mean": 0.68017578125, + "rewards//std": 0.04801594093441963, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0022, + "grad_norm": 3.0773203372955322, + "kl": 0.0005841563324793242, + "learning_rate": 2e-07, + "loss": 0.0001, + "num_tokens": 95456.0, + "reward": 0.69476318359375, + "reward_std": 0.012789730913937092, + "rewards//mean": 0.69476318359375, + "rewards//std": 0.043625302612781525, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0024, + "grad_norm": 3.3067240715026855, + "kl": 0.0005825260886922479, + "learning_rate": 2.1999999999999998e-07, + "loss": 0.0001, + "num_tokens": 104112.0, + "reward": 0.707763671875, + "reward_std": 0.014192605391144753, + "rewards//mean": 0.707763671875, + "rewards//std": 0.04431390017271042, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0026, + "grad_norm": 3.052741527557373, + "kl": 0.0005862039251951501, + "learning_rate": 2.4e-07, + "loss": 0.0001, + "num_tokens": 112728.0, + "reward": 0.7296142578125, + "reward_std": 0.01562296599149704, + "rewards//mean": 0.7296142578125, + "rewards//std": 0.04462214559316635, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0028, + "grad_norm": 3.496222496032715, + "kl": 0.000546930474229157, + "learning_rate": 2.6e-07, + "loss": 0.0001, + "num_tokens": 121352.0, + "reward": 0.75439453125, + "reward_std": 0.020952299237251282, + "rewards//mean": 0.75439453125, + "rewards//std": 0.04351207613945007, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.003, + "grad_norm": 3.0683610439300537, + "kl": 0.0005508811809704639, + "learning_rate": 2.8e-07, + "loss": 0.0001, + "num_tokens": 130072.0, + "reward": 0.72918701171875, + "reward_std": 0.02001885510981083, + "rewards//mean": 0.72918701171875, + "rewards//std": 0.059948600828647614, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0032, + "grad_norm": 3.168755292892456, + "kl": 0.0005511691051651724, + "learning_rate": 3e-07, + "loss": 0.0001, + "num_tokens": 138680.0, + "reward": 0.731689453125, + "reward_std": 0.015139667317271233, + "rewards//mean": 0.731689453125, + "rewards//std": 0.04729103669524193, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0034, + "grad_norm": 3.126718044281006, + "kl": 0.000573088698729407, + "learning_rate": 3.2e-07, + "loss": 0.0001, + "num_tokens": 147320.0, + "reward": 0.68780517578125, + "reward_std": 0.01579727604985237, + "rewards//mean": 0.68780517578125, + "rewards//std": 0.058778662234544754, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0036, + "grad_norm": 3.0903584957122803, + "kl": 0.0005722851856262423, + "learning_rate": 3.4000000000000003e-07, + "loss": 0.0001, + "num_tokens": 155984.0, + "reward": 0.71990966796875, + "reward_std": 0.012143155559897423, + "rewards//mean": 0.71990966796875, + "rewards//std": 0.047596342861652374, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0038, + "grad_norm": 3.2795073986053467, + "kl": 0.000557706574909389, + "learning_rate": 3.6e-07, + "loss": 0.0001, + "num_tokens": 164608.0, + "reward": 0.702392578125, + "reward_std": 0.021895065903663635, + "rewards//mean": 0.702392578125, + "rewards//std": 0.06125873699784279, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.004, + "grad_norm": 3.1292951107025146, + "kl": 0.0005708587632398121, + "learning_rate": 3.7999999999999996e-07, + "loss": 0.0001, + "num_tokens": 173168.0, + "reward": 0.72113037109375, + "reward_std": 0.0165967158973217, + "rewards//mean": 0.72113037109375, + "rewards//std": 0.05302632972598076, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0042, + "grad_norm": 3.0912601947784424, + "kl": 0.0005462863045977429, + "learning_rate": 4e-07, + "loss": 0.0001, + "num_tokens": 181792.0, + "reward": 0.72027587890625, + "reward_std": 0.015079968608915806, + "rewards//mean": 0.72027587890625, + "rewards//std": 0.0568997748196125, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0044, + "grad_norm": 3.1044301986694336, + "kl": 0.0005402523092925549, + "learning_rate": 4.1999999999999995e-07, + "loss": 0.0001, + "num_tokens": 190392.0, + "reward": 0.73260498046875, + "reward_std": 0.01755766198039055, + "rewards//mean": 0.73260498046875, + "rewards//std": 0.03580089658498764, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0046, + "grad_norm": 3.1160430908203125, + "kl": 0.0005472921475302428, + "learning_rate": 4.3999999999999997e-07, + "loss": 0.0001, + "num_tokens": 199080.0, + "reward": 0.69964599609375, + "reward_std": 0.015488953329622746, + "rewards//mean": 0.69964599609375, + "rewards//std": 0.04417289048433304, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0048, + "grad_norm": 3.093228340148926, + "kl": 0.000571258133277297, + "learning_rate": 4.6e-07, + "loss": 0.0001, + "num_tokens": 207720.0, + "reward": 0.72613525390625, + "reward_std": 0.015438448637723923, + "rewards//mean": 0.72613525390625, + "rewards//std": 0.05386695638298988, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.005, + "grad_norm": 3.132035493850708, + "kl": 0.0005702183116227388, + "learning_rate": 4.8e-07, + "loss": 0.0001, + "num_tokens": 216296.0, + "reward": 0.708251953125, + "reward_std": 0.01361482311040163, + "rewards//mean": 0.708251953125, + "rewards//std": 0.0540633462369442, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0052, + "grad_norm": 3.058742046356201, + "kl": 0.0005953195868642069, + "learning_rate": 5e-07, + "loss": 0.0001, + "num_tokens": 224968.0, + "reward": 0.75274658203125, + "reward_std": 0.021526148542761803, + "rewards//mean": 0.75274658203125, + "rewards//std": 0.048328571021556854, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0054, + "grad_norm": 3.0073442459106445, + "kl": 0.0005892160552321002, + "learning_rate": 5.2e-07, + "loss": 0.0001, + "num_tokens": 233528.0, + "reward": 0.67626953125, + "reward_std": 0.01784338429570198, + "rewards//mean": 0.67626953125, + "rewards//std": 0.06989618390798569, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0056, + "grad_norm": 3.2221901416778564, + "kl": 0.0005890742468181998, + "learning_rate": 5.4e-07, + "loss": 0.0001, + "num_tokens": 242280.0, + "reward": 0.68499755859375, + "reward_std": 0.014033805578947067, + "rewards//mean": 0.68499755859375, + "rewards//std": 0.04582660645246506, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0058, + "grad_norm": 3.031466484069824, + "kl": 0.000607152069278527, + "learning_rate": 5.6e-07, + "loss": 0.0001, + "num_tokens": 250976.0, + "reward": 0.712646484375, + "reward_std": 0.016498101875185966, + "rewards//mean": 0.712646484375, + "rewards//std": 0.04970809444785118, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.006, + "grad_norm": 3.025085687637329, + "kl": 0.0005704485083697364, + "learning_rate": 5.8e-07, + "loss": 0.0001, + "num_tokens": 259632.0, + "reward": 0.70501708984375, + "reward_std": 0.015339357778429985, + "rewards//mean": 0.70501708984375, + "rewards//std": 0.045702897012233734, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0062, + "grad_norm": 3.0791728496551514, + "kl": 0.0006308649681159295, + "learning_rate": 6e-07, + "loss": 0.0001, + "num_tokens": 268208.0, + "reward": 0.739013671875, + "reward_std": 0.017989136278629303, + "rewards//mean": 0.739013671875, + "rewards//std": 0.0432741641998291, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0064, + "grad_norm": 3.1217832565307617, + "kl": 0.0006620676867896691, + "learning_rate": 6.2e-07, + "loss": 0.0001, + "num_tokens": 276824.0, + "reward": 0.70941162109375, + "reward_std": 0.016736187040805817, + "rewards//mean": 0.70941162109375, + "rewards//std": 0.05383322015404701, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0066, + "grad_norm": 3.2112765312194824, + "kl": 0.0005905148573219776, + "learning_rate": 6.4e-07, + "loss": 0.0001, + "num_tokens": 285520.0, + "reward": 0.73883056640625, + "reward_std": 0.017011437565088272, + "rewards//mean": 0.73883056640625, + "rewards//std": 0.04840962961316109, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0068, + "grad_norm": 3.0612096786499023, + "kl": 0.0005876668219571002, + "learning_rate": 6.6e-07, + "loss": 0.0001, + "num_tokens": 294144.0, + "reward": 0.70819091796875, + "reward_std": 0.017199836671352386, + "rewards//mean": 0.70819091796875, + "rewards//std": 0.04584411159157753, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.007, + "grad_norm": 3.0824496746063232, + "kl": 0.0006361504347296432, + "learning_rate": 6.800000000000001e-07, + "loss": 0.0001, + "num_tokens": 302888.0, + "reward": 0.735107421875, + "reward_std": 0.011731683276593685, + "rewards//mean": 0.735107421875, + "rewards//std": 0.037343598902225494, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0072, + "grad_norm": 3.2275145053863525, + "kl": 0.0007240941995405592, + "learning_rate": 7e-07, + "loss": 0.0001, + "num_tokens": 311504.0, + "reward": 0.71002197265625, + "reward_std": 0.01906929537653923, + "rewards//mean": 0.71002197265625, + "rewards//std": 0.06062629818916321, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0074, + "grad_norm": 3.2091662883758545, + "kl": 0.000766696102800779, + "learning_rate": 7.2e-07, + "loss": 0.0001, + "num_tokens": 320208.0, + "reward": 0.72698974609375, + "reward_std": 0.015764687210321426, + "rewards//mean": 0.72698974609375, + "rewards//std": 0.04941116273403168, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0076, + "grad_norm": 3.056762933731079, + "kl": 0.0006720083692925982, + "learning_rate": 7.4e-07, + "loss": 0.0001, + "num_tokens": 328904.0, + "reward": 0.7076416015625, + "reward_std": 0.013192662969231606, + "rewards//mean": 0.7076416015625, + "rewards//std": 0.03601868078112602, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0078, + "grad_norm": 3.024362564086914, + "kl": 0.0007397676017717458, + "learning_rate": 7.599999999999999e-07, + "loss": 0.0001, + "num_tokens": 337632.0, + "reward": 0.7073974609375, + "reward_std": 0.016517726704478264, + "rewards//mean": 0.7073974609375, + "rewards//std": 0.06210627406835556, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.008, + "grad_norm": 3.030052423477173, + "kl": 0.000783703027991578, + "learning_rate": 7.799999999999999e-07, + "loss": 0.0001, + "num_tokens": 346264.0, + "reward": 0.6876220703125, + "reward_std": 0.01800483465194702, + "rewards//mean": 0.6876220703125, + "rewards//std": 0.04799812287092209, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0082, + "grad_norm": 3.3742542266845703, + "kl": 0.0007968554928083904, + "learning_rate": 8e-07, + "loss": 0.0001, + "num_tokens": 354912.0, + "reward": 0.72332763671875, + "reward_std": 0.015946194529533386, + "rewards//mean": 0.72332763671875, + "rewards//std": 0.05200214684009552, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0084, + "grad_norm": 2.983952522277832, + "kl": 0.0008011522004380822, + "learning_rate": 8.199999999999999e-07, + "loss": 0.0001, + "num_tokens": 363544.0, + "reward": 0.7408447265625, + "reward_std": 0.013474998995661736, + "rewards//mean": 0.7408447265625, + "rewards//std": 0.04879509657621384, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0086, + "grad_norm": 3.248126268386841, + "kl": 0.0008595763283665292, + "learning_rate": 8.399999999999999e-07, + "loss": 0.0001, + "num_tokens": 372144.0, + "reward": 0.73944091796875, + "reward_std": 0.016470063477754593, + "rewards//mean": 0.73944091796875, + "rewards//std": 0.047130998224020004, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0088, + "grad_norm": 3.0845632553100586, + "kl": 0.0008815952387521975, + "learning_rate": 8.599999999999999e-07, + "loss": 0.0001, + "num_tokens": 380848.0, + "reward": 0.73309326171875, + "reward_std": 0.01539240125566721, + "rewards//mean": 0.73309326171875, + "rewards//std": 0.04680288955569267, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.009, + "grad_norm": 3.2412116527557373, + "kl": 0.000932343871681951, + "learning_rate": 8.799999999999999e-07, + "loss": 0.0001, + "num_tokens": 389504.0, + "reward": 0.6689453125, + "reward_std": 0.012580599635839462, + "rewards//mean": 0.6689453125, + "rewards//std": 0.04404045641422272, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0092, + "grad_norm": 3.1200408935546875, + "kl": 0.0009622859070077538, + "learning_rate": 9e-07, + "loss": 0.0001, + "num_tokens": 398200.0, + "reward": 0.72552490234375, + "reward_std": 0.015103422105312347, + "rewards//mean": 0.72552490234375, + "rewards//std": 0.06293346732854843, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0094, + "grad_norm": 3.1873373985290527, + "kl": 0.001048956903105136, + "learning_rate": 9.2e-07, + "loss": 0.0001, + "num_tokens": 406848.0, + "reward": 0.701416015625, + "reward_std": 0.0156770758330822, + "rewards//mean": 0.701416015625, + "rewards//std": 0.0535547137260437, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0096, + "grad_norm": 3.0196340084075928, + "kl": 0.0011119443515781313, + "learning_rate": 9.399999999999999e-07, + "loss": 0.0001, + "num_tokens": 415520.0, + "reward": 0.70697021484375, + "reward_std": 0.016891758888959885, + "rewards//mean": 0.70697021484375, + "rewards//std": 0.057861633598804474, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0098, + "grad_norm": 2.955000400543213, + "kl": 0.0010805678321048617, + "learning_rate": 9.6e-07, + "loss": 0.0001, + "num_tokens": 424088.0, + "reward": 0.71795654296875, + "reward_std": 0.016512058675289154, + "rewards//mean": 0.71795654296875, + "rewards//std": 0.05882345885038376, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.01, + "grad_norm": 3.1035845279693604, + "kl": 0.001265802318812348, + "learning_rate": 9.8e-07, + "loss": 0.0001, + "num_tokens": 432696.0, + "reward": 0.7044677734375, + "reward_std": 0.014481520280241966, + "rewards//mean": 0.7044677734375, + "rewards//std": 0.049777936190366745, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0102, + "grad_norm": 3.059105396270752, + "kl": 0.0014140766143100336, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 441352.0, + "reward": 0.7259521484375, + "reward_std": 0.014453758485615253, + "rewards//mean": 0.7259521484375, + "rewards//std": 0.03619309142231941, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0104, + "grad_norm": 3.0305216312408447, + "kl": 0.001641553535591811, + "learning_rate": 9.999998993000298e-07, + "loss": 0.0002, + "num_tokens": 449960.0, + "reward": 0.74005126953125, + "reward_std": 0.015035301446914673, + "rewards//mean": 0.74005126953125, + "rewards//std": 0.049174390733242035, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0106, + "grad_norm": 3.1068880558013916, + "kl": 0.0014859264192637056, + "learning_rate": 9.999995972001601e-07, + "loss": 0.0001, + "num_tokens": 458512.0, + "reward": 0.7259521484375, + "reward_std": 0.019076917320489883, + "rewards//mean": 0.7259521484375, + "rewards//std": 0.045931410044431686, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0108, + "grad_norm": 3.2462215423583984, + "kl": 0.0017005849367706105, + "learning_rate": 9.999990937005123e-07, + "loss": 0.0002, + "num_tokens": 467048.0, + "reward": 0.72802734375, + "reward_std": 0.01297299936413765, + "rewards//mean": 0.72802734375, + "rewards//std": 0.044377390295267105, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.011, + "grad_norm": 2.8820033073425293, + "kl": 0.0015795445651747286, + "learning_rate": 9.999983888012896e-07, + "loss": 0.0002, + "num_tokens": 475728.0, + "reward": 0.70489501953125, + "reward_std": 0.01869776099920273, + "rewards//mean": 0.70489501953125, + "rewards//std": 0.06755409389734268, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0112, + "grad_norm": 3.0614702701568604, + "kl": 0.0019457548114587553, + "learning_rate": 9.999974825027754e-07, + "loss": 0.0002, + "num_tokens": 484360.0, + "reward": 0.7200927734375, + "reward_std": 0.014355950988829136, + "rewards//mean": 0.7200927734375, + "rewards//std": 0.05347480624914169, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0114, + "grad_norm": 3.1908984184265137, + "kl": 0.0019651364054880105, + "learning_rate": 9.999963748053354e-07, + "loss": 0.0002, + "num_tokens": 493000.0, + "reward": 0.738525390625, + "reward_std": 0.01591881364583969, + "rewards//mean": 0.738525390625, + "rewards//std": 0.04919874295592308, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0116, + "grad_norm": 2.9364521503448486, + "kl": 0.0021245284951874055, + "learning_rate": 9.99995065709415e-07, + "loss": 0.0002, + "num_tokens": 501632.0, + "reward": 0.719970703125, + "reward_std": 0.01409243606030941, + "rewards//mean": 0.719970703125, + "rewards//std": 0.04894702881574631, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0118, + "grad_norm": 3.003506660461426, + "kl": 0.0020506469227257185, + "learning_rate": 9.999935552155421e-07, + "loss": 0.0002, + "num_tokens": 510288.0, + "reward": 0.72265625, + "reward_std": 0.013906879350543022, + "rewards//mean": 0.72265625, + "rewards//std": 0.04887336865067482, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.012, + "grad_norm": 3.354442596435547, + "kl": 0.002489405007509049, + "learning_rate": 9.99991843324325e-07, + "loss": 0.0002, + "num_tokens": 518952.0, + "reward": 0.72845458984375, + "reward_std": 0.0177521500736475, + "rewards//mean": 0.72845458984375, + "rewards//std": 0.04704001545906067, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0122, + "grad_norm": 3.0271553993225098, + "kl": 0.0023055829078657553, + "learning_rate": 9.999899300364532e-07, + "loss": 0.0002, + "num_tokens": 527520.0, + "reward": 0.71478271484375, + "reward_std": 0.013522474095225334, + "rewards//mean": 0.71478271484375, + "rewards//std": 0.04290665686130524, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0124, + "grad_norm": 3.1818554401397705, + "kl": 0.0025641661195550114, + "learning_rate": 9.999878153526972e-07, + "loss": 0.0003, + "num_tokens": 536112.0, + "reward": 0.70977783203125, + "reward_std": 0.01426254864782095, + "rewards//mean": 0.70977783203125, + "rewards//std": 0.051858142018318176, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0126, + "grad_norm": 3.3307740688323975, + "kl": 0.0030574638076359406, + "learning_rate": 9.999854992739093e-07, + "loss": 0.0003, + "num_tokens": 544736.0, + "reward": 0.70355224609375, + "reward_std": 0.013520177453756332, + "rewards//mean": 0.70355224609375, + "rewards//std": 0.04383714869618416, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0128, + "grad_norm": 3.1743972301483154, + "kl": 0.0038234230887610465, + "learning_rate": 9.999829818010219e-07, + "loss": 0.0004, + "num_tokens": 553408.0, + "reward": 0.72265625, + "reward_std": 0.015198908746242523, + "rewards//mean": 0.72265625, + "rewards//std": 0.04554367810487747, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.013, + "grad_norm": 3.0791256427764893, + "kl": 0.0031573468004353344, + "learning_rate": 9.999802629350491e-07, + "loss": 0.0003, + "num_tokens": 562064.0, + "reward": 0.72296142578125, + "reward_std": 0.016398118808865547, + "rewards//mean": 0.72296142578125, + "rewards//std": 0.04709308221936226, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0132, + "grad_norm": 3.233900785446167, + "kl": 0.003446219547186047, + "learning_rate": 9.999773426770863e-07, + "loss": 0.0003, + "num_tokens": 570664.0, + "reward": 0.71954345703125, + "reward_std": 0.015487316995859146, + "rewards//mean": 0.71954345703125, + "rewards//std": 0.04876014590263367, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0134, + "grad_norm": 3.3780455589294434, + "kl": 0.004207952646538615, + "learning_rate": 9.999742210283097e-07, + "loss": 0.0004, + "num_tokens": 579360.0, + "reward": 0.71832275390625, + "reward_std": 0.013939326629042625, + "rewards//mean": 0.71832275390625, + "rewards//std": 0.04669441655278206, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0136, + "grad_norm": 3.114373207092285, + "kl": 0.004350957824499346, + "learning_rate": 9.999708979899767e-07, + "loss": 0.0004, + "num_tokens": 587992.0, + "reward": 0.7491455078125, + "reward_std": 0.013725947588682175, + "rewards//mean": 0.7491455078125, + "rewards//std": 0.03409987688064575, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0138, + "grad_norm": 3.0971555709838867, + "kl": 0.004925672605168074, + "learning_rate": 9.999673735634259e-07, + "loss": 0.0005, + "num_tokens": 596608.0, + "reward": 0.7000732421875, + "reward_std": 0.015324430540204048, + "rewards//mean": 0.7000732421875, + "rewards//std": 0.03915829584002495, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.014, + "grad_norm": 2.964268445968628, + "kl": 0.004543857765384018, + "learning_rate": 9.999636477500764e-07, + "loss": 0.0005, + "num_tokens": 605248.0, + "reward": 0.708251953125, + "reward_std": 0.017837759107351303, + "rewards//mean": 0.708251953125, + "rewards//std": 0.05202692002058029, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0142, + "grad_norm": 3.014002561569214, + "kl": 0.005491413321578875, + "learning_rate": 9.999597205514296e-07, + "loss": 0.0005, + "num_tokens": 613824.0, + "reward": 0.6988525390625, + "reward_std": 0.017433026805520058, + "rewards//mean": 0.6988525390625, + "rewards//std": 0.05124284327030182, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0144, + "grad_norm": 2.9774134159088135, + "kl": 0.0056799468729877844, + "learning_rate": 9.999555919690672e-07, + "loss": 0.0006, + "num_tokens": 622328.0, + "reward": 0.73919677734375, + "reward_std": 0.014505268074572086, + "rewards//mean": 0.73919677734375, + "rewards//std": 0.05622804909944534, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0146, + "grad_norm": 3.0427486896514893, + "kl": 0.005467346069053747, + "learning_rate": 9.99951262004652e-07, + "loss": 0.0005, + "num_tokens": 630976.0, + "reward": 0.6893310546875, + "reward_std": 0.01674201712012291, + "rewards//mean": 0.6893310546875, + "rewards//std": 0.037168681621551514, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0148, + "grad_norm": 3.013369083404541, + "kl": 0.005091004626592621, + "learning_rate": 9.999467306599285e-07, + "loss": 0.0005, + "num_tokens": 639624.0, + "reward": 0.73468017578125, + "reward_std": 0.013643546961247921, + "rewards//mean": 0.73468017578125, + "rewards//std": 0.03605286404490471, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.015, + "grad_norm": 3.043957471847534, + "kl": 0.008441416663117707, + "learning_rate": 9.999419979367214e-07, + "loss": 0.0008, + "num_tokens": 648320.0, + "reward": 0.7056884765625, + "reward_std": 0.015556867234408855, + "rewards//mean": 0.7056884765625, + "rewards//std": 0.05865868926048279, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0152, + "grad_norm": 3.231454849243164, + "kl": 0.008039619016926736, + "learning_rate": 9.999370638369376e-07, + "loss": 0.0008, + "num_tokens": 657080.0, + "reward": 0.7269287109375, + "reward_std": 0.013333200477063656, + "rewards//mean": 0.7269287109375, + "rewards//std": 0.04582317918539047, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0154, + "grad_norm": 3.110569715499878, + "kl": 0.007806680485373363, + "learning_rate": 9.99931928362564e-07, + "loss": 0.0008, + "num_tokens": 665720.0, + "reward": 0.69854736328125, + "reward_std": 0.016080046072602272, + "rewards//mean": 0.69854736328125, + "rewards//std": 0.060083795338869095, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0156, + "grad_norm": 2.921905994415283, + "kl": 0.007719275105046108, + "learning_rate": 9.999265915156696e-07, + "loss": 0.0008, + "num_tokens": 674336.0, + "reward": 0.720458984375, + "reward_std": 0.015469206497073174, + "rewards//mean": 0.720458984375, + "rewards//std": 0.047003354877233505, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0158, + "grad_norm": 3.413708448410034, + "kl": 0.009065819176612422, + "learning_rate": 9.999210532984038e-07, + "loss": 0.0009, + "num_tokens": 682968.0, + "reward": 0.68780517578125, + "reward_std": 0.017976250499486923, + "rewards//mean": 0.68780517578125, + "rewards//std": 0.06774518638849258, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.016, + "grad_norm": 3.190436363220215, + "kl": 0.008012848178623244, + "learning_rate": 9.999153137129977e-07, + "loss": 0.0008, + "num_tokens": 691640.0, + "reward": 0.74041748046875, + "reward_std": 0.017986297607421875, + "rewards//mean": 0.74041748046875, + "rewards//std": 0.057460226118564606, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0162, + "grad_norm": 3.164731502532959, + "kl": 0.007367404759861529, + "learning_rate": 9.999093727617628e-07, + "loss": 0.0007, + "num_tokens": 700264.0, + "reward": 0.7017822265625, + "reward_std": 0.016125842928886414, + "rewards//mean": 0.7017822265625, + "rewards//std": 0.035538043826818466, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0164, + "grad_norm": 2.9602649211883545, + "kl": 0.008762065350310877, + "learning_rate": 9.999032304470924e-07, + "loss": 0.0009, + "num_tokens": 708984.0, + "reward": 0.737548828125, + "reward_std": 0.013302361592650414, + "rewards//mean": 0.737548828125, + "rewards//std": 0.04255741462111473, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0166, + "grad_norm": 3.0777437686920166, + "kl": 0.009679947281256318, + "learning_rate": 9.998968867714608e-07, + "loss": 0.001, + "num_tokens": 717568.0, + "reward": 0.73052978515625, + "reward_std": 0.012443384155631065, + "rewards//mean": 0.73052978515625, + "rewards//std": 0.04523146152496338, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0168, + "grad_norm": 3.295330762863159, + "kl": 0.01055754155095201, + "learning_rate": 9.998903417374226e-07, + "loss": 0.0011, + "num_tokens": 726304.0, + "reward": 0.7119140625, + "reward_std": 0.013127093203365803, + "rewards//mean": 0.7119140625, + "rewards//std": 0.04087439924478531, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.017, + "grad_norm": 3.223742961883545, + "kl": 0.010981887433445081, + "learning_rate": 9.998835953476147e-07, + "loss": 0.0011, + "num_tokens": 735000.0, + "reward": 0.73431396484375, + "reward_std": 0.012129535898566246, + "rewards//mean": 0.73431396484375, + "rewards//std": 0.04945709556341171, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0172, + "grad_norm": 3.227137327194214, + "kl": 0.010623314272379503, + "learning_rate": 9.998766476047545e-07, + "loss": 0.0011, + "num_tokens": 743648.0, + "reward": 0.7049560546875, + "reward_std": 0.017435938119888306, + "rewards//mean": 0.7049560546875, + "rewards//std": 0.06491030752658844, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0174, + "grad_norm": 3.1377432346343994, + "kl": 0.011226685048313811, + "learning_rate": 9.998694985116404e-07, + "loss": 0.0011, + "num_tokens": 752416.0, + "reward": 0.71868896484375, + "reward_std": 0.011546581983566284, + "rewards//mean": 0.71868896484375, + "rewards//std": 0.06741254776716232, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0176, + "grad_norm": 2.9976580142974854, + "kl": 0.011882514314493164, + "learning_rate": 9.99862148071152e-07, + "loss": 0.0012, + "num_tokens": 761040.0, + "reward": 0.73583984375, + "reward_std": 0.013298182748258114, + "rewards//mean": 0.73583984375, + "rewards//std": 0.05867636576294899, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0178, + "grad_norm": 3.0490307807922363, + "kl": 0.010607951902784407, + "learning_rate": 9.998545962862501e-07, + "loss": 0.0011, + "num_tokens": 769656.0, + "reward": 0.7401123046875, + "reward_std": 0.018074776977300644, + "rewards//mean": 0.7401123046875, + "rewards//std": 0.05264287069439888, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.018, + "grad_norm": 2.989919662475586, + "kl": 0.012744891719194129, + "learning_rate": 9.998468431599767e-07, + "loss": 0.0013, + "num_tokens": 778248.0, + "reward": 0.7086181640625, + "reward_std": 0.0159517303109169, + "rewards//mean": 0.7086181640625, + "rewards//std": 0.020442752167582512, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0182, + "grad_norm": 3.2538132667541504, + "kl": 0.013988890452310443, + "learning_rate": 9.998388886954545e-07, + "loss": 0.0014, + "num_tokens": 786856.0, + "reward": 0.69415283203125, + "reward_std": 0.017283614724874496, + "rewards//mean": 0.69415283203125, + "rewards//std": 0.0727759599685669, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0184, + "grad_norm": 3.338348388671875, + "kl": 0.013513089710613713, + "learning_rate": 9.998307328958877e-07, + "loss": 0.0014, + "num_tokens": 795544.0, + "reward": 0.696533203125, + "reward_std": 0.017394915223121643, + "rewards//mean": 0.696533203125, + "rewards//std": 0.06082624942064285, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0186, + "grad_norm": 3.17159104347229, + "kl": 0.012391012947773561, + "learning_rate": 9.998223757645617e-07, + "loss": 0.0012, + "num_tokens": 804104.0, + "reward": 0.7415771484375, + "reward_std": 0.01519560907036066, + "rewards//mean": 0.7415771484375, + "rewards//std": 0.05159846320748329, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0188, + "grad_norm": 2.9525721073150635, + "kl": 0.013435031520202756, + "learning_rate": 9.998138173048423e-07, + "loss": 0.0013, + "num_tokens": 812768.0, + "reward": 0.74493408203125, + "reward_std": 0.018135907128453255, + "rewards//mean": 0.74493408203125, + "rewards//std": 0.0533226802945137, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.019, + "grad_norm": 3.0639989376068115, + "kl": 0.013018126715905964, + "learning_rate": 9.99805057520177e-07, + "loss": 0.0013, + "num_tokens": 821400.0, + "reward": 0.74066162109375, + "reward_std": 0.013175277039408684, + "rewards//mean": 0.74066162109375, + "rewards//std": 0.04255594685673714, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0192, + "grad_norm": 3.360504627227783, + "kl": 0.011382284079445526, + "learning_rate": 9.997960964140945e-07, + "loss": 0.0011, + "num_tokens": 829952.0, + "reward": 0.7005615234375, + "reward_std": 0.011306056752800941, + "rewards//mean": 0.7005615234375, + "rewards//std": 0.03229600936174393, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0194, + "grad_norm": 3.1438801288604736, + "kl": 0.01552145613823086, + "learning_rate": 9.99786933990204e-07, + "loss": 0.0016, + "num_tokens": 838520.0, + "reward": 0.71697998046875, + "reward_std": 0.011182492598891258, + "rewards//mean": 0.71697998046875, + "rewards//std": 0.04406755790114403, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0196, + "grad_norm": 3.20588755607605, + "kl": 0.015421856835018843, + "learning_rate": 9.997775702521965e-07, + "loss": 0.0015, + "num_tokens": 847128.0, + "reward": 0.708740234375, + "reward_std": 0.014776019379496574, + "rewards//mean": 0.708740234375, + "rewards//std": 0.06943509727716446, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0198, + "grad_norm": 2.945568561553955, + "kl": 0.015174815838690847, + "learning_rate": 9.997680052038434e-07, + "loss": 0.0015, + "num_tokens": 855824.0, + "reward": 0.69830322265625, + "reward_std": 0.014003828167915344, + "rewards//mean": 0.69830322265625, + "rewards//std": 0.04895626753568649, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.02, + "grad_norm": 3.091007709503174, + "kl": 0.01613260098383762, + "learning_rate": 9.997582388489973e-07, + "loss": 0.0016, + "num_tokens": 864520.0, + "reward": 0.70916748046875, + "reward_std": 0.012781353667378426, + "rewards//mean": 0.70916748046875, + "rewards//std": 0.07657796889543533, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0202, + "grad_norm": 2.9633381366729736, + "kl": 0.01702519622631371, + "learning_rate": 9.997482711915925e-07, + "loss": 0.0017, + "num_tokens": 873152.0, + "reward": 0.7056884765625, + "reward_std": 0.015273596160113811, + "rewards//mean": 0.7056884765625, + "rewards//std": 0.05478581786155701, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0204, + "grad_norm": 3.0281424522399902, + "kl": 0.01615259307436645, + "learning_rate": 9.99738102235644e-07, + "loss": 0.0016, + "num_tokens": 881824.0, + "reward": 0.70928955078125, + "reward_std": 0.013487773947417736, + "rewards//mean": 0.70928955078125, + "rewards//std": 0.0424087829887867, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0206, + "grad_norm": 3.10520076751709, + "kl": 0.01550467952620238, + "learning_rate": 9.997277319852474e-07, + "loss": 0.0016, + "num_tokens": 890368.0, + "reward": 0.7510986328125, + "reward_std": 0.014099751599133015, + "rewards//mean": 0.7510986328125, + "rewards//std": 0.04322708770632744, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0208, + "grad_norm": 3.294306516647339, + "kl": 0.018466165813151747, + "learning_rate": 9.997171604445802e-07, + "loss": 0.0018, + "num_tokens": 899128.0, + "reward": 0.75274658203125, + "reward_std": 0.02102423831820488, + "rewards//mean": 0.75274658203125, + "rewards//std": 0.04623865336179733, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.021, + "grad_norm": 2.8307101726531982, + "kl": 0.017229145538294688, + "learning_rate": 9.997063876179007e-07, + "loss": 0.0017, + "num_tokens": 907808.0, + "reward": 0.716064453125, + "reward_std": 0.015134226530790329, + "rewards//mean": 0.716064453125, + "rewards//std": 0.04795221611857414, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0212, + "grad_norm": 3.230468273162842, + "kl": 0.01487345719942823, + "learning_rate": 9.996954135095478e-07, + "loss": 0.0015, + "num_tokens": 916384.0, + "reward": 0.7489013671875, + "reward_std": 0.015160983428359032, + "rewards//mean": 0.7489013671875, + "rewards//std": 0.04460042715072632, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0214, + "grad_norm": 3.031060218811035, + "kl": 0.0202089183148928, + "learning_rate": 9.996842381239422e-07, + "loss": 0.002, + "num_tokens": 925000.0, + "reward": 0.73760986328125, + "reward_std": 0.01461248192936182, + "rewards//mean": 0.73760986328125, + "rewards//std": 0.028718072921037674, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0216, + "grad_norm": 2.955716609954834, + "kl": 0.017871063493657857, + "learning_rate": 9.996728614655853e-07, + "loss": 0.0018, + "num_tokens": 933680.0, + "reward": 0.72540283203125, + "reward_std": 0.012798861600458622, + "rewards//mean": 0.72540283203125, + "rewards//std": 0.034496817737817764, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0218, + "grad_norm": 3.2322795391082764, + "kl": 0.01609462348278612, + "learning_rate": 9.996612835390594e-07, + "loss": 0.0016, + "num_tokens": 942360.0, + "reward": 0.7109375, + "reward_std": 0.01488967053592205, + "rewards//mean": 0.7109375, + "rewards//std": 0.05337861180305481, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.022, + "grad_norm": 3.2382678985595703, + "kl": 0.02326445246580988, + "learning_rate": 9.996495043490283e-07, + "loss": 0.0023, + "num_tokens": 951000.0, + "reward": 0.75140380859375, + "reward_std": 0.012074257247149944, + "rewards//mean": 0.75140380859375, + "rewards//std": 0.04795974865555763, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0222, + "grad_norm": 3.04787278175354, + "kl": 0.01807693997398019, + "learning_rate": 9.996375239002368e-07, + "loss": 0.0018, + "num_tokens": 959688.0, + "reward": 0.6810302734375, + "reward_std": 0.014035972766578197, + "rewards//mean": 0.6810302734375, + "rewards//std": 0.06006631255149841, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0224, + "grad_norm": 2.985187292098999, + "kl": 0.02059358573751524, + "learning_rate": 9.996253421975102e-07, + "loss": 0.0021, + "num_tokens": 968352.0, + "reward": 0.73004150390625, + "reward_std": 0.012494131922721863, + "rewards//mean": 0.73004150390625, + "rewards//std": 0.04424411430954933, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0226, + "grad_norm": 2.9988229274749756, + "kl": 0.021581392036750913, + "learning_rate": 9.996129592457556e-07, + "loss": 0.0022, + "num_tokens": 976936.0, + "reward": 0.7276611328125, + "reward_std": 0.013638101518154144, + "rewards//mean": 0.7276611328125, + "rewards//std": 0.04925578832626343, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0228, + "grad_norm": 3.142057418823242, + "kl": 0.021858555090148002, + "learning_rate": 9.996003750499607e-07, + "loss": 0.0022, + "num_tokens": 985552.0, + "reward": 0.7265625, + "reward_std": 0.016436271369457245, + "rewards//mean": 0.7265625, + "rewards//std": 0.04292645305395126, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.023, + "grad_norm": 2.978050470352173, + "kl": 0.02196191088296473, + "learning_rate": 9.995875896151944e-07, + "loss": 0.0022, + "num_tokens": 994264.0, + "reward": 0.72979736328125, + "reward_std": 0.012108192779123783, + "rewards//mean": 0.72979736328125, + "rewards//std": 0.036838702857494354, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0232, + "grad_norm": 2.9693009853363037, + "kl": 0.02642357745207846, + "learning_rate": 9.99574602946607e-07, + "loss": 0.0026, + "num_tokens": 1002832.0, + "reward": 0.72235107421875, + "reward_std": 0.012415243312716484, + "rewards//mean": 0.72235107421875, + "rewards//std": 0.03544308617711067, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0234, + "grad_norm": 3.0097129344940186, + "kl": 0.023999035358428955, + "learning_rate": 9.99561415049429e-07, + "loss": 0.0024, + "num_tokens": 1011448.0, + "reward": 0.72015380859375, + "reward_std": 0.015053506940603256, + "rewards//mean": 0.72015380859375, + "rewards//std": 0.03507256507873535, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0236, + "grad_norm": 2.9691386222839355, + "kl": 0.022306351806037128, + "learning_rate": 9.99548025928973e-07, + "loss": 0.0022, + "num_tokens": 1020104.0, + "reward": 0.7171630859375, + "reward_std": 0.01403750479221344, + "rewards//mean": 0.7171630859375, + "rewards//std": 0.03787709400057793, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0238, + "grad_norm": 3.0105018615722656, + "kl": 0.01927804498700425, + "learning_rate": 9.995344355906318e-07, + "loss": 0.0019, + "num_tokens": 1028696.0, + "reward": 0.72906494140625, + "reward_std": 0.01614948734641075, + "rewards//mean": 0.72906494140625, + "rewards//std": 0.02800769917666912, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.024, + "grad_norm": 3.310129404067993, + "kl": 0.022020738862920552, + "learning_rate": 9.995206440398796e-07, + "loss": 0.0022, + "num_tokens": 1037384.0, + "reward": 0.70855712890625, + "reward_std": 0.012240133248269558, + "rewards//mean": 0.70855712890625, + "rewards//std": 0.038188494741916656, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0242, + "grad_norm": 3.0024490356445312, + "kl": 0.020602340518962592, + "learning_rate": 9.995066512822718e-07, + "loss": 0.0021, + "num_tokens": 1046176.0, + "reward": 0.713623046875, + "reward_std": 0.007958738133311272, + "rewards//mean": 0.713623046875, + "rewards//std": 0.04178783670067787, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0244, + "grad_norm": 3.0775814056396484, + "kl": 0.027243567805271596, + "learning_rate": 9.994924573234446e-07, + "loss": 0.0027, + "num_tokens": 1054816.0, + "reward": 0.71746826171875, + "reward_std": 0.016701359301805496, + "rewards//mean": 0.71746826171875, + "rewards//std": 0.058855876326560974, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0246, + "grad_norm": 3.011335611343384, + "kl": 0.02239358614315279, + "learning_rate": 9.994780621691154e-07, + "loss": 0.0022, + "num_tokens": 1063496.0, + "reward": 0.7353515625, + "reward_std": 0.013828590512275696, + "rewards//mean": 0.7353515625, + "rewards//std": 0.03586220741271973, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0248, + "grad_norm": 3.2017664909362793, + "kl": 0.02780767437070608, + "learning_rate": 9.994634658250824e-07, + "loss": 0.0028, + "num_tokens": 1072104.0, + "reward": 0.743408203125, + "reward_std": 0.012461268343031406, + "rewards//mean": 0.743408203125, + "rewards//std": 0.04123353585600853, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.025, + "grad_norm": 3.2028744220733643, + "kl": 0.022625074605457485, + "learning_rate": 9.994486682972252e-07, + "loss": 0.0023, + "num_tokens": 1080752.0, + "reward": 0.7366943359375, + "reward_std": 0.014579675160348415, + "rewards//mean": 0.7366943359375, + "rewards//std": 0.038674402981996536, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0252, + "grad_norm": 2.7825567722320557, + "kl": 0.025536290602758527, + "learning_rate": 9.99433669591504e-07, + "loss": 0.0026, + "num_tokens": 1089368.0, + "reward": 0.7174072265625, + "reward_std": 0.013983946293592453, + "rewards//mean": 0.7174072265625, + "rewards//std": 0.05010770633816719, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0254, + "grad_norm": 2.904855966567993, + "kl": 0.026453224942088127, + "learning_rate": 9.994184697139604e-07, + "loss": 0.0026, + "num_tokens": 1097992.0, + "reward": 0.74237060546875, + "reward_std": 0.013720160350203514, + "rewards//mean": 0.74237060546875, + "rewards//std": 0.026350749656558037, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0256, + "grad_norm": 3.1194891929626465, + "kl": 0.026587890926748514, + "learning_rate": 9.99403068670717e-07, + "loss": 0.0027, + "num_tokens": 1106576.0, + "reward": 0.7259521484375, + "reward_std": 0.01477651484310627, + "rewards//mean": 0.7259521484375, + "rewards//std": 0.03868379816412926, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0258, + "grad_norm": 2.9864237308502197, + "kl": 0.03235742053948343, + "learning_rate": 9.993874664679772e-07, + "loss": 0.0032, + "num_tokens": 1115160.0, + "reward": 0.71636962890625, + "reward_std": 0.013840307481586933, + "rewards//mean": 0.71636962890625, + "rewards//std": 0.043076373636722565, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.026, + "grad_norm": 3.104255199432373, + "kl": 0.02509330166503787, + "learning_rate": 9.993716631120258e-07, + "loss": 0.0025, + "num_tokens": 1123808.0, + "reward": 0.72076416015625, + "reward_std": 0.01157199963927269, + "rewards//mean": 0.72076416015625, + "rewards//std": 0.04962088167667389, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0262, + "grad_norm": 2.942652702331543, + "kl": 0.031184019171632826, + "learning_rate": 9.99355658609228e-07, + "loss": 0.0031, + "num_tokens": 1132528.0, + "reward": 0.7171630859375, + "reward_std": 0.015324447304010391, + "rewards//mean": 0.7171630859375, + "rewards//std": 0.04991641268134117, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0264, + "grad_norm": 2.9444375038146973, + "kl": 0.03304997179657221, + "learning_rate": 9.993394529660306e-07, + "loss": 0.0033, + "num_tokens": 1141160.0, + "reward": 0.736328125, + "reward_std": 0.013412285596132278, + "rewards//mean": 0.736328125, + "rewards//std": 0.032526031136512756, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0266, + "grad_norm": 3.2348594665527344, + "kl": 0.03439075197093189, + "learning_rate": 9.993230461889615e-07, + "loss": 0.0034, + "num_tokens": 1149744.0, + "reward": 0.72637939453125, + "reward_std": 0.015599433332681656, + "rewards//mean": 0.72637939453125, + "rewards//std": 0.03849054127931595, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0268, + "grad_norm": 3.5405704975128174, + "kl": 0.03284156124573201, + "learning_rate": 9.993064382846289e-07, + "loss": 0.0033, + "num_tokens": 1158344.0, + "reward": 0.72247314453125, + "reward_std": 0.015042467974126339, + "rewards//mean": 0.72247314453125, + "rewards//std": 0.04495181515812874, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.027, + "grad_norm": 3.0331664085388184, + "kl": 0.030899111938197166, + "learning_rate": 9.992896292597228e-07, + "loss": 0.0031, + "num_tokens": 1166920.0, + "reward": 0.68408203125, + "reward_std": 0.018408963456749916, + "rewards//mean": 0.68408203125, + "rewards//std": 0.03464268893003464, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0272, + "grad_norm": 3.180854082107544, + "kl": 0.03986567130777985, + "learning_rate": 9.992726191210137e-07, + "loss": 0.004, + "num_tokens": 1175528.0, + "reward": 0.73486328125, + "reward_std": 0.01281731203198433, + "rewards//mean": 0.73486328125, + "rewards//std": 0.04209749773144722, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0274, + "grad_norm": 2.917390823364258, + "kl": 0.031310008256696165, + "learning_rate": 9.992554078753533e-07, + "loss": 0.0031, + "num_tokens": 1184144.0, + "reward": 0.7275390625, + "reward_std": 0.016602514311671257, + "rewards//mean": 0.7275390625, + "rewards//std": 0.05057813972234726, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0276, + "grad_norm": 2.7335574626922607, + "kl": 0.030509869335219264, + "learning_rate": 9.992379955296745e-07, + "loss": 0.0031, + "num_tokens": 1192832.0, + "reward": 0.7412109375, + "reward_std": 0.016559338197112083, + "rewards//mean": 0.7412109375, + "rewards//std": 0.043858595192432404, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0278, + "grad_norm": 3.168146848678589, + "kl": 0.04336894187144935, + "learning_rate": 9.992203820909905e-07, + "loss": 0.0043, + "num_tokens": 1201472.0, + "reward": 0.7071533203125, + "reward_std": 0.012441834434866905, + "rewards//mean": 0.7071533203125, + "rewards//std": 0.056705132126808167, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.028, + "grad_norm": 2.996670722961426, + "kl": 0.03549187898170203, + "learning_rate": 9.992025675663965e-07, + "loss": 0.0035, + "num_tokens": 1210176.0, + "reward": 0.750732421875, + "reward_std": 0.01398580614477396, + "rewards//mean": 0.750732421875, + "rewards//std": 0.03658350929617882, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0282, + "grad_norm": 3.4842395782470703, + "kl": 0.03604160330723971, + "learning_rate": 9.991845519630676e-07, + "loss": 0.0036, + "num_tokens": 1218872.0, + "reward": 0.73309326171875, + "reward_std": 0.01453987043350935, + "rewards//mean": 0.73309326171875, + "rewards//std": 0.03210673853754997, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0284, + "grad_norm": 3.1160051822662354, + "kl": 0.03330606734380126, + "learning_rate": 9.991663352882613e-07, + "loss": 0.0033, + "num_tokens": 1227624.0, + "reward": 0.732666015625, + "reward_std": 0.013473456725478172, + "rewards//mean": 0.732666015625, + "rewards//std": 0.043580908328294754, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0286, + "grad_norm": 2.8810338973999023, + "kl": 0.041373745538294315, + "learning_rate": 9.991479175493148e-07, + "loss": 0.0041, + "num_tokens": 1236264.0, + "reward": 0.7386474609375, + "reward_std": 0.011745231226086617, + "rewards//mean": 0.7386474609375, + "rewards//std": 0.03809388726949692, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0288, + "grad_norm": 3.0718138217926025, + "kl": 0.035379409266170114, + "learning_rate": 9.991292987536468e-07, + "loss": 0.0035, + "num_tokens": 1244984.0, + "reward": 0.7083740234375, + "reward_std": 0.01356479525566101, + "rewards//mean": 0.7083740234375, + "rewards//std": 0.07008899748325348, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.029, + "grad_norm": 2.791430950164795, + "kl": 0.034212324768304825, + "learning_rate": 9.991104789087569e-07, + "loss": 0.0034, + "num_tokens": 1253544.0, + "reward": 0.69757080078125, + "reward_std": 0.013164354488253593, + "rewards//mean": 0.69757080078125, + "rewards//std": 0.04364160820841789, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0292, + "grad_norm": 2.9670677185058594, + "kl": 0.04070010129362345, + "learning_rate": 9.990914580222255e-07, + "loss": 0.0041, + "num_tokens": 1262272.0, + "reward": 0.75714111328125, + "reward_std": 0.014646430499851704, + "rewards//mean": 0.75714111328125, + "rewards//std": 0.04350125044584274, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0294, + "grad_norm": 3.1393849849700928, + "kl": 0.03216705098748207, + "learning_rate": 9.990722361017149e-07, + "loss": 0.0032, + "num_tokens": 1270984.0, + "reward": 0.7379150390625, + "reward_std": 0.015045834705233574, + "rewards//mean": 0.7379150390625, + "rewards//std": 0.0475710891187191, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0296, + "grad_norm": 3.0052967071533203, + "kl": 0.0336679095053114, + "learning_rate": 9.990528131549671e-07, + "loss": 0.0034, + "num_tokens": 1279664.0, + "reward": 0.72052001953125, + "reward_std": 0.016495231539011, + "rewards//mean": 0.72052001953125, + "rewards//std": 0.04428686201572418, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0298, + "grad_norm": 2.77880597114563, + "kl": 0.04252167057711631, + "learning_rate": 9.990331891898058e-07, + "loss": 0.0043, + "num_tokens": 1288360.0, + "reward": 0.72564697265625, + "reward_std": 0.013310113921761513, + "rewards//mean": 0.72564697265625, + "rewards//std": 0.03551433980464935, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.03, + "grad_norm": 2.8032565116882324, + "kl": 0.034908757312223315, + "learning_rate": 9.990133642141357e-07, + "loss": 0.0035, + "num_tokens": 1297032.0, + "reward": 0.74017333984375, + "reward_std": 0.011333253234624863, + "rewards//mean": 0.74017333984375, + "rewards//std": 0.03461815416812897, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0302, + "grad_norm": 2.9779255390167236, + "kl": 0.03928355360403657, + "learning_rate": 9.989933382359422e-07, + "loss": 0.0039, + "num_tokens": 1305632.0, + "reward": 0.712158203125, + "reward_std": 0.012044407427310944, + "rewards//mean": 0.712158203125, + "rewards//std": 0.040784675627946854, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0304, + "grad_norm": 3.1361095905303955, + "kl": 0.04866291838698089, + "learning_rate": 9.989731112632916e-07, + "loss": 0.0049, + "num_tokens": 1314272.0, + "reward": 0.72283935546875, + "reward_std": 0.013317112810909748, + "rewards//mean": 0.72283935546875, + "rewards//std": 0.04381676763296127, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0306, + "grad_norm": 2.8830525875091553, + "kl": 0.04183753626421094, + "learning_rate": 9.989526833043316e-07, + "loss": 0.0042, + "num_tokens": 1322960.0, + "reward": 0.76031494140625, + "reward_std": 0.01955876313149929, + "rewards//mean": 0.76031494140625, + "rewards//std": 0.05183799937367439, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0308, + "grad_norm": 2.8472084999084473, + "kl": 0.04606934660114348, + "learning_rate": 9.989320543672903e-07, + "loss": 0.0046, + "num_tokens": 1331608.0, + "reward": 0.73382568359375, + "reward_std": 0.016142776235938072, + "rewards//mean": 0.73382568359375, + "rewards//std": 0.054491691291332245, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.031, + "grad_norm": 2.762299060821533, + "kl": 0.03941800841130316, + "learning_rate": 9.989112244604771e-07, + "loss": 0.0039, + "num_tokens": 1340352.0, + "reward": 0.73822021484375, + "reward_std": 0.013351533561944962, + "rewards//mean": 0.73822021484375, + "rewards//std": 0.04714544862508774, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0312, + "grad_norm": 3.0830838680267334, + "kl": 0.03498400142416358, + "learning_rate": 9.988901935922825e-07, + "loss": 0.0035, + "num_tokens": 1349024.0, + "reward": 0.72576904296875, + "reward_std": 0.013017626479268074, + "rewards//mean": 0.72576904296875, + "rewards//std": 0.03293436020612717, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0314, + "grad_norm": 3.0565831661224365, + "kl": 0.04686063149711117, + "learning_rate": 9.988689617711776e-07, + "loss": 0.0047, + "num_tokens": 1357544.0, + "reward": 0.73284912109375, + "reward_std": 0.013014718890190125, + "rewards//mean": 0.73284912109375, + "rewards//std": 0.0456744059920311, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0316, + "grad_norm": 2.797093629837036, + "kl": 0.05101523862686008, + "learning_rate": 9.988475290057143e-07, + "loss": 0.0051, + "num_tokens": 1366224.0, + "reward": 0.734130859375, + "reward_std": 0.011036617681384087, + "rewards//mean": 0.734130859375, + "rewards//std": 0.04602967947721481, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0318, + "grad_norm": 3.194139242172241, + "kl": 0.05134878121316433, + "learning_rate": 9.988258953045262e-07, + "loss": 0.0051, + "num_tokens": 1374848.0, + "reward": 0.73370361328125, + "reward_std": 0.01682290807366371, + "rewards//mean": 0.73370361328125, + "rewards//std": 0.043828513473272324, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.032, + "grad_norm": 3.09578537940979, + "kl": 0.0347396379802376, + "learning_rate": 9.988040606763272e-07, + "loss": 0.0035, + "num_tokens": 1383456.0, + "reward": 0.69970703125, + "reward_std": 0.014353256672620773, + "rewards//mean": 0.69970703125, + "rewards//std": 0.05614941567182541, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0322, + "grad_norm": 3.0768558979034424, + "kl": 0.0460287892492488, + "learning_rate": 9.98782025129912e-07, + "loss": 0.0046, + "num_tokens": 1392112.0, + "reward": 0.72900390625, + "reward_std": 0.01641533523797989, + "rewards//mean": 0.72900390625, + "rewards//std": 0.0477275513112545, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0324, + "grad_norm": 2.6946260929107666, + "kl": 0.04967822623439133, + "learning_rate": 9.987597886741568e-07, + "loss": 0.005, + "num_tokens": 1400784.0, + "reward": 0.75042724609375, + "reward_std": 0.013429421000182629, + "rewards//mean": 0.75042724609375, + "rewards//std": 0.04385371878743172, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0326, + "grad_norm": 2.928901195526123, + "kl": 0.04909839539323002, + "learning_rate": 9.987373513180184e-07, + "loss": 0.0049, + "num_tokens": 1409344.0, + "reward": 0.7476806640625, + "reward_std": 0.013493198901414871, + "rewards//mean": 0.7476806640625, + "rewards//std": 0.03423633053898811, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0328, + "grad_norm": 2.9144558906555176, + "kl": 0.050017297733575106, + "learning_rate": 9.987147130705347e-07, + "loss": 0.005, + "num_tokens": 1417920.0, + "reward": 0.73626708984375, + "reward_std": 0.01147711556404829, + "rewards//mean": 0.73626708984375, + "rewards//std": 0.0361848808825016, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.033, + "grad_norm": 3.2065091133117676, + "kl": 0.05320246773771942, + "learning_rate": 9.98691873940824e-07, + "loss": 0.0053, + "num_tokens": 1426608.0, + "reward": 0.72613525390625, + "reward_std": 0.016028691083192825, + "rewards//mean": 0.72613525390625, + "rewards//std": 0.040774233639240265, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0332, + "grad_norm": 2.9466049671173096, + "kl": 0.05405107664410025, + "learning_rate": 9.98668833938086e-07, + "loss": 0.0054, + "num_tokens": 1435216.0, + "reward": 0.7215576171875, + "reward_std": 0.015404738485813141, + "rewards//mean": 0.7215576171875, + "rewards//std": 0.03980853036046028, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0334, + "grad_norm": 2.866468667984009, + "kl": 0.044184350059367716, + "learning_rate": 9.986455930716016e-07, + "loss": 0.0044, + "num_tokens": 1443832.0, + "reward": 0.6962890625, + "reward_std": 0.014004740864038467, + "rewards//mean": 0.6962890625, + "rewards//std": 0.0615786537528038, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0336, + "grad_norm": 2.7617156505584717, + "kl": 0.0638790549710393, + "learning_rate": 9.986221513507318e-07, + "loss": 0.0064, + "num_tokens": 1452488.0, + "reward": 0.7462158203125, + "reward_std": 0.014732494950294495, + "rewards//mean": 0.7462158203125, + "rewards//std": 0.038838449865579605, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0338, + "grad_norm": 2.615412950515747, + "kl": 0.047671781037934124, + "learning_rate": 9.985985087849191e-07, + "loss": 0.0048, + "num_tokens": 1461184.0, + "reward": 0.740966796875, + "reward_std": 0.010601690039038658, + "rewards//mean": 0.740966796875, + "rewards//std": 0.04212266206741333, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.034, + "grad_norm": 2.4398818016052246, + "kl": 0.04848790564574301, + "learning_rate": 9.985746653836866e-07, + "loss": 0.0048, + "num_tokens": 1469920.0, + "reward": 0.74945068359375, + "reward_std": 0.0126343360170722, + "rewards//mean": 0.74945068359375, + "rewards//std": 0.05037597566843033, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0342, + "grad_norm": 2.9143829345703125, + "kl": 0.05843106552492827, + "learning_rate": 9.985506211566386e-07, + "loss": 0.0058, + "num_tokens": 1478560.0, + "reward": 0.722412109375, + "reward_std": 0.015987034887075424, + "rewards//mean": 0.722412109375, + "rewards//std": 0.04930692911148071, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0344, + "grad_norm": 2.8919947147369385, + "kl": 0.04990722984075546, + "learning_rate": 9.9852637611346e-07, + "loss": 0.005, + "num_tokens": 1487232.0, + "reward": 0.6947021484375, + "reward_std": 0.012111399322748184, + "rewards//mean": 0.6947021484375, + "rewards//std": 0.05466189235448837, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0346, + "grad_norm": 2.790154218673706, + "kl": 0.06499220291152596, + "learning_rate": 9.98501930263917e-07, + "loss": 0.0065, + "num_tokens": 1495848.0, + "reward": 0.71551513671875, + "reward_std": 0.014181406237185001, + "rewards//mean": 0.71551513671875, + "rewards//std": 0.04905419051647186, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0348, + "grad_norm": 2.9558165073394775, + "kl": 0.0698660952039063, + "learning_rate": 9.984772836178556e-07, + "loss": 0.007, + "num_tokens": 1504680.0, + "reward": 0.73541259765625, + "reward_std": 0.013052749447524548, + "rewards//mean": 0.73541259765625, + "rewards//std": 0.04531572014093399, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.035, + "grad_norm": 2.8896403312683105, + "kl": 0.06764502776786685, + "learning_rate": 9.984524361852043e-07, + "loss": 0.0068, + "num_tokens": 1513360.0, + "reward": 0.712890625, + "reward_std": 0.009989009238779545, + "rewards//mean": 0.712890625, + "rewards//std": 0.05103578418493271, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0352, + "grad_norm": 2.751380443572998, + "kl": 0.06374173518270254, + "learning_rate": 9.984273879759712e-07, + "loss": 0.0064, + "num_tokens": 1522112.0, + "reward": 0.73516845703125, + "reward_std": 0.01333437766879797, + "rewards//mean": 0.73516845703125, + "rewards//std": 0.03761053830385208, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0354, + "grad_norm": 3.128718137741089, + "kl": 0.07074581575579941, + "learning_rate": 9.984021390002457e-07, + "loss": 0.0071, + "num_tokens": 1530848.0, + "reward": 0.70831298828125, + "reward_std": 0.011916648596525192, + "rewards//mean": 0.70831298828125, + "rewards//std": 0.043787047266960144, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0356, + "grad_norm": 3.010329246520996, + "kl": 0.07081070146523416, + "learning_rate": 9.983766892681985e-07, + "loss": 0.0071, + "num_tokens": 1539528.0, + "reward": 0.7213134765625, + "reward_std": 0.014988021925091743, + "rewards//mean": 0.7213134765625, + "rewards//std": 0.031827643513679504, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0358, + "grad_norm": 2.772942304611206, + "kl": 0.06943181017413735, + "learning_rate": 9.983510387900802e-07, + "loss": 0.0069, + "num_tokens": 1548192.0, + "reward": 0.70550537109375, + "reward_std": 0.011353434063494205, + "rewards//mean": 0.70550537109375, + "rewards//std": 0.05040841922163963, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.036, + "grad_norm": 2.921886682510376, + "kl": 0.07014578208327293, + "learning_rate": 9.983251875762232e-07, + "loss": 0.007, + "num_tokens": 1556856.0, + "reward": 0.7374267578125, + "reward_std": 0.015500213950872421, + "rewards//mean": 0.7374267578125, + "rewards//std": 0.04746277630329132, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0362, + "grad_norm": 2.5986506938934326, + "kl": 0.06503755692392588, + "learning_rate": 9.982991356370403e-07, + "loss": 0.0065, + "num_tokens": 1565488.0, + "reward": 0.7337646484375, + "reward_std": 0.012532995082437992, + "rewards//mean": 0.7337646484375, + "rewards//std": 0.04871561378240585, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0364, + "grad_norm": 2.6686618328094482, + "kl": 0.07410385878756642, + "learning_rate": 9.98272882983025e-07, + "loss": 0.0074, + "num_tokens": 1574184.0, + "reward": 0.73846435546875, + "reward_std": 0.01319533959031105, + "rewards//mean": 0.73846435546875, + "rewards//std": 0.05068543553352356, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0366, + "grad_norm": 3.029433250427246, + "kl": 0.07510905456729233, + "learning_rate": 9.982464296247522e-07, + "loss": 0.0075, + "num_tokens": 1582888.0, + "reward": 0.735107421875, + "reward_std": 0.013636925257742405, + "rewards//mean": 0.735107421875, + "rewards//std": 0.03843037039041519, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0368, + "grad_norm": 2.4225409030914307, + "kl": 0.07150726299732924, + "learning_rate": 9.98219775572877e-07, + "loss": 0.0072, + "num_tokens": 1591408.0, + "reward": 0.75115966796875, + "reward_std": 0.014323122799396515, + "rewards//mean": 0.75115966796875, + "rewards//std": 0.036202866584062576, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.037, + "grad_norm": 2.533130168914795, + "kl": 0.07464027963578701, + "learning_rate": 9.981929208381357e-07, + "loss": 0.0075, + "num_tokens": 1600088.0, + "reward": 0.76458740234375, + "reward_std": 0.014717087149620056, + "rewards//mean": 0.76458740234375, + "rewards//std": 0.04011625796556473, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0372, + "grad_norm": 2.432730197906494, + "kl": 0.0943055716343224, + "learning_rate": 9.981658654313456e-07, + "loss": 0.0094, + "num_tokens": 1608712.0, + "reward": 0.7225341796875, + "reward_std": 0.0087115578353405, + "rewards//mean": 0.7225341796875, + "rewards//std": 0.04055297002196312, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0374, + "grad_norm": 2.751760482788086, + "kl": 0.08527720882557333, + "learning_rate": 9.981386093634045e-07, + "loss": 0.0085, + "num_tokens": 1617400.0, + "reward": 0.75360107421875, + "reward_std": 0.013320360332727432, + "rewards//mean": 0.75360107421875, + "rewards//std": 0.028614573180675507, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0376, + "grad_norm": 2.9692792892456055, + "kl": 0.07570782792754471, + "learning_rate": 9.98111152645291e-07, + "loss": 0.0076, + "num_tokens": 1625992.0, + "reward": 0.76275634765625, + "reward_std": 0.015970878303050995, + "rewards//mean": 0.76275634765625, + "rewards//std": 0.037092190235853195, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0378, + "grad_norm": 3.0419697761535645, + "kl": 0.06978122459258884, + "learning_rate": 9.98083495288065e-07, + "loss": 0.007, + "num_tokens": 1634576.0, + "reward": 0.7088623046875, + "reward_std": 0.01774086058139801, + "rewards//mean": 0.7088623046875, + "rewards//std": 0.04886329919099808, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.038, + "grad_norm": 2.7983527183532715, + "kl": 0.06901918211951852, + "learning_rate": 9.980556373028665e-07, + "loss": 0.0069, + "num_tokens": 1643200.0, + "reward": 0.737548828125, + "reward_std": 0.012527244165539742, + "rewards//mean": 0.737548828125, + "rewards//std": 0.038499634712934494, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0382, + "grad_norm": 2.493257522583008, + "kl": 0.08206672128289938, + "learning_rate": 9.98027578700917e-07, + "loss": 0.0082, + "num_tokens": 1651848.0, + "reward": 0.70538330078125, + "reward_std": 0.01150240283459425, + "rewards//mean": 0.70538330078125, + "rewards//std": 0.03535756468772888, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0384, + "grad_norm": 2.7214250564575195, + "kl": 0.08754912205040455, + "learning_rate": 9.979993194935182e-07, + "loss": 0.0088, + "num_tokens": 1660472.0, + "reward": 0.73846435546875, + "reward_std": 0.013667328283190727, + "rewards//mean": 0.73846435546875, + "rewards//std": 0.04352281987667084, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0386, + "grad_norm": 2.9392783641815186, + "kl": 0.07446632068604231, + "learning_rate": 9.979708596920529e-07, + "loss": 0.0074, + "num_tokens": 1669128.0, + "reward": 0.750244140625, + "reward_std": 0.015270931646227837, + "rewards//mean": 0.750244140625, + "rewards//std": 0.030583124607801437, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0388, + "grad_norm": 3.087444543838501, + "kl": 0.08147471048869193, + "learning_rate": 9.97942199307985e-07, + "loss": 0.0081, + "num_tokens": 1677784.0, + "reward": 0.7628173828125, + "reward_std": 0.014745705761015415, + "rewards//mean": 0.7628173828125, + "rewards//std": 0.04542369768023491, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.039, + "grad_norm": 2.8398241996765137, + "kl": 0.0784428627230227, + "learning_rate": 9.97913338352859e-07, + "loss": 0.0078, + "num_tokens": 1686448.0, + "reward": 0.735107421875, + "reward_std": 0.012391097843647003, + "rewards//mean": 0.735107421875, + "rewards//std": 0.030039772391319275, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0392, + "grad_norm": 2.49025821685791, + "kl": 0.08109743148088455, + "learning_rate": 9.978842768382998e-07, + "loss": 0.0081, + "num_tokens": 1695072.0, + "reward": 0.70452880859375, + "reward_std": 0.013618113473057747, + "rewards//mean": 0.70452880859375, + "rewards//std": 0.03166856989264488, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0394, + "grad_norm": 3.270029306411743, + "kl": 0.08498156163841486, + "learning_rate": 9.978550147760131e-07, + "loss": 0.0085, + "num_tokens": 1703680.0, + "reward": 0.73675537109375, + "reward_std": 0.017492208629846573, + "rewards//mean": 0.73675537109375, + "rewards//std": 0.0403052382171154, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0396, + "grad_norm": 2.5561161041259766, + "kl": 0.0859982690308243, + "learning_rate": 9.978255521777862e-07, + "loss": 0.0086, + "num_tokens": 1712304.0, + "reward": 0.73126220703125, + "reward_std": 0.0102681340649724, + "rewards//mean": 0.73126220703125, + "rewards//std": 0.045068852603435516, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0398, + "grad_norm": 3.171823263168335, + "kl": 0.07568260550033301, + "learning_rate": 9.977958890554866e-07, + "loss": 0.0076, + "num_tokens": 1720936.0, + "reward": 0.71221923828125, + "reward_std": 0.01587653160095215, + "rewards//mean": 0.71221923828125, + "rewards//std": 0.03565939515829086, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.04, + "grad_norm": 2.9293112754821777, + "kl": 0.07837174762971699, + "learning_rate": 9.97766025421062e-07, + "loss": 0.0078, + "num_tokens": 1729552.0, + "reward": 0.72186279296875, + "reward_std": 0.014022290706634521, + "rewards//mean": 0.72186279296875, + "rewards//std": 0.03507126867771149, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0402, + "grad_norm": 2.8085222244262695, + "kl": 0.08167064702138305, + "learning_rate": 9.977359612865422e-07, + "loss": 0.0082, + "num_tokens": 1738184.0, + "reward": 0.71942138671875, + "reward_std": 0.012507260777056217, + "rewards//mean": 0.71942138671875, + "rewards//std": 0.0408717580139637, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0404, + "grad_norm": 3.0235538482666016, + "kl": 0.07556112413294613, + "learning_rate": 9.977056966640367e-07, + "loss": 0.0076, + "num_tokens": 1746792.0, + "reward": 0.73321533203125, + "reward_std": 0.01591806672513485, + "rewards//mean": 0.73321533203125, + "rewards//std": 0.0390729159116745, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0406, + "grad_norm": 2.861449718475342, + "kl": 0.07926510332617909, + "learning_rate": 9.976752315657359e-07, + "loss": 0.0079, + "num_tokens": 1755408.0, + "reward": 0.744140625, + "reward_std": 0.01228757668286562, + "rewards//mean": 0.744140625, + "rewards//std": 0.03467413783073425, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0408, + "grad_norm": 3.436288595199585, + "kl": 0.09631677670404315, + "learning_rate": 9.976445660039117e-07, + "loss": 0.0096, + "num_tokens": 1764008.0, + "reward": 0.74468994140625, + "reward_std": 0.012933210469782352, + "rewards//mean": 0.74468994140625, + "rewards//std": 0.03882846236228943, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.041, + "grad_norm": 2.9841854572296143, + "kl": 0.09829538897611201, + "learning_rate": 9.976136999909155e-07, + "loss": 0.0098, + "num_tokens": 1772688.0, + "reward": 0.7412109375, + "reward_std": 0.010125808417797089, + "rewards//mean": 0.7412109375, + "rewards//std": 0.041701529175043106, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0412, + "grad_norm": 3.1280481815338135, + "kl": 0.09879924496635795, + "learning_rate": 9.975826335391805e-07, + "loss": 0.0099, + "num_tokens": 1781256.0, + "reward": 0.74725341796875, + "reward_std": 0.01598658226430416, + "rewards//mean": 0.74725341796875, + "rewards//std": 0.041038088500499725, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0414, + "grad_norm": 2.806591272354126, + "kl": 0.08446787379216403, + "learning_rate": 9.975513666612203e-07, + "loss": 0.0084, + "num_tokens": 1789976.0, + "reward": 0.74365234375, + "reward_std": 0.01498311199247837, + "rewards//mean": 0.74365234375, + "rewards//std": 0.05660908296704292, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0416, + "grad_norm": 2.473963737487793, + "kl": 0.09275762271136045, + "learning_rate": 9.975198993696291e-07, + "loss": 0.0093, + "num_tokens": 1798664.0, + "reward": 0.7291259765625, + "reward_std": 0.013060636818408966, + "rewards//mean": 0.7291259765625, + "rewards//std": 0.04006778821349144, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0418, + "grad_norm": 2.5815858840942383, + "kl": 0.09844274073839188, + "learning_rate": 9.97488231677082e-07, + "loss": 0.0098, + "num_tokens": 1807424.0, + "reward": 0.68670654296875, + "reward_std": 0.013432216830551624, + "rewards//mean": 0.68670654296875, + "rewards//std": 0.04699461907148361, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.042, + "grad_norm": 2.440018892288208, + "kl": 0.09588717669248581, + "learning_rate": 9.974563635963347e-07, + "loss": 0.0096, + "num_tokens": 1816088.0, + "reward": 0.72821044921875, + "reward_std": 0.009933840483427048, + "rewards//mean": 0.72821044921875, + "rewards//std": 0.04099749028682709, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0422, + "grad_norm": 2.6709070205688477, + "kl": 0.09110309137031436, + "learning_rate": 9.974242951402235e-07, + "loss": 0.0091, + "num_tokens": 1824672.0, + "reward": 0.69232177734375, + "reward_std": 0.011585031636059284, + "rewards//mean": 0.69232177734375, + "rewards//std": 0.046945951879024506, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0424, + "grad_norm": 3.0153114795684814, + "kl": 0.10191798605956137, + "learning_rate": 9.973920263216657e-07, + "loss": 0.0102, + "num_tokens": 1833248.0, + "reward": 0.7841796875, + "reward_std": 0.014959658496081829, + "rewards//mean": 0.7841796875, + "rewards//std": 0.03194751217961311, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0426, + "grad_norm": 2.5440561771392822, + "kl": 0.0974107151851058, + "learning_rate": 9.97359557153659e-07, + "loss": 0.0097, + "num_tokens": 1841808.0, + "reward": 0.7314453125, + "reward_std": 0.009747691452503204, + "rewards//mean": 0.7314453125, + "rewards//std": 0.03378254920244217, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0428, + "grad_norm": 3.8489737510681152, + "kl": 0.0931732514873147, + "learning_rate": 9.973268876492825e-07, + "loss": 0.0093, + "num_tokens": 1850392.0, + "reward": 0.7232666015625, + "reward_std": 0.019548147916793823, + "rewards//mean": 0.7232666015625, + "rewards//std": 0.04620479792356491, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.043, + "grad_norm": 3.214967966079712, + "kl": 0.09402831085026264, + "learning_rate": 9.972940178216952e-07, + "loss": 0.0094, + "num_tokens": 1859016.0, + "reward": 0.7628173828125, + "reward_std": 0.010977610945701599, + "rewards//mean": 0.7628173828125, + "rewards//std": 0.04047824442386627, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0432, + "grad_norm": 3.1702773571014404, + "kl": 0.09994646161794662, + "learning_rate": 9.972609476841365e-07, + "loss": 0.01, + "num_tokens": 1867616.0, + "reward": 0.739990234375, + "reward_std": 0.008601821959018707, + "rewards//mean": 0.739990234375, + "rewards//std": 0.022733250632882118, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0434, + "grad_norm": 2.3969881534576416, + "kl": 0.09891431382857263, + "learning_rate": 9.97227677249928e-07, + "loss": 0.0099, + "num_tokens": 1876296.0, + "reward": 0.74725341796875, + "reward_std": 0.013425001874566078, + "rewards//mean": 0.74725341796875, + "rewards//std": 0.038598936051130295, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0436, + "grad_norm": 2.8317806720733643, + "kl": 0.10404033353552222, + "learning_rate": 9.971942065324702e-07, + "loss": 0.0104, + "num_tokens": 1884904.0, + "reward": 0.74896240234375, + "reward_std": 0.01664729230105877, + "rewards//mean": 0.74896240234375, + "rewards//std": 0.03809085860848427, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0438, + "grad_norm": 2.7183804512023926, + "kl": 0.09752598311752081, + "learning_rate": 9.971605355452457e-07, + "loss": 0.0098, + "num_tokens": 1893616.0, + "reward": 0.7247314453125, + "reward_std": 0.0124040637165308, + "rewards//mean": 0.7247314453125, + "rewards//std": 0.04993703216314316, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.044, + "grad_norm": 3.6828420162200928, + "kl": 0.10862272512167692, + "learning_rate": 9.97126664301817e-07, + "loss": 0.0109, + "num_tokens": 1902160.0, + "reward": 0.6983642578125, + "reward_std": 0.01596534624695778, + "rewards//mean": 0.6983642578125, + "rewards//std": 0.04885586351156235, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0442, + "grad_norm": 2.340144634246826, + "kl": 0.10954310419037938, + "learning_rate": 9.970925928158272e-07, + "loss": 0.011, + "num_tokens": 1910880.0, + "reward": 0.731689453125, + "reward_std": 0.01111997477710247, + "rewards//mean": 0.731689453125, + "rewards//std": 0.035861365497112274, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0444, + "grad_norm": 3.0173556804656982, + "kl": 0.11066217673942447, + "learning_rate": 9.970583211010007e-07, + "loss": 0.0111, + "num_tokens": 1919640.0, + "reward": 0.70489501953125, + "reward_std": 0.015776721760630608, + "rewards//mean": 0.70489501953125, + "rewards//std": 0.0446137897670269, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0446, + "grad_norm": 2.8475310802459717, + "kl": 0.10389193054288626, + "learning_rate": 9.970238491711415e-07, + "loss": 0.0104, + "num_tokens": 1928296.0, + "reward": 0.72418212890625, + "reward_std": 0.011221460998058319, + "rewards//mean": 0.72418212890625, + "rewards//std": 0.032448623329401016, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0448, + "grad_norm": 2.4673573970794678, + "kl": 0.10303299408406019, + "learning_rate": 9.969891770401356e-07, + "loss": 0.0103, + "num_tokens": 1937088.0, + "reward": 0.751708984375, + "reward_std": 0.010914693586528301, + "rewards//mean": 0.751708984375, + "rewards//std": 0.03294682502746582, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.045, + "grad_norm": 3.270146608352661, + "kl": 0.11207640403881669, + "learning_rate": 9.969543047219486e-07, + "loss": 0.0112, + "num_tokens": 1945688.0, + "reward": 0.75653076171875, + "reward_std": 0.016141315922141075, + "rewards//mean": 0.75653076171875, + "rewards//std": 0.0417482852935791, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0452, + "grad_norm": 2.757606029510498, + "kl": 0.10813413886353374, + "learning_rate": 9.96919232230627e-07, + "loss": 0.0108, + "num_tokens": 1954320.0, + "reward": 0.74822998046875, + "reward_std": 0.012435732409358025, + "rewards//mean": 0.74822998046875, + "rewards//std": 0.039290811866521835, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0454, + "grad_norm": 2.6634392738342285, + "kl": 0.11925068125128746, + "learning_rate": 9.968839595802981e-07, + "loss": 0.0119, + "num_tokens": 1962944.0, + "reward": 0.716552734375, + "reward_std": 0.010293394327163696, + "rewards//mean": 0.716552734375, + "rewards//std": 0.027611492201685905, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0456, + "grad_norm": 3.6396124362945557, + "kl": 0.12086705304682255, + "learning_rate": 9.968484867851697e-07, + "loss": 0.0121, + "num_tokens": 1971624.0, + "reward": 0.758544921875, + "reward_std": 0.015222180634737015, + "rewards//mean": 0.758544921875, + "rewards//std": 0.05078781023621559, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0458, + "grad_norm": 2.7004621028900146, + "kl": 0.1087592770345509, + "learning_rate": 9.968128138595302e-07, + "loss": 0.0109, + "num_tokens": 1980280.0, + "reward": 0.71453857421875, + "reward_std": 0.013990378007292747, + "rewards//mean": 0.71453857421875, + "rewards//std": 0.03948374465107918, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.046, + "grad_norm": 3.056368589401245, + "kl": 0.11676244903355837, + "learning_rate": 9.967769408177488e-07, + "loss": 0.0117, + "num_tokens": 1988880.0, + "reward": 0.69268798828125, + "reward_std": 0.013287386856973171, + "rewards//mean": 0.69268798828125, + "rewards//std": 0.05184588208794594, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0462, + "grad_norm": 2.9305176734924316, + "kl": 0.1197158400900662, + "learning_rate": 9.967408676742751e-07, + "loss": 0.012, + "num_tokens": 1997536.0, + "reward": 0.74053955078125, + "reward_std": 0.013810476288199425, + "rewards//mean": 0.74053955078125, + "rewards//std": 0.04055796191096306, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0464, + "grad_norm": 3.2707579135894775, + "kl": 0.13016177900135517, + "learning_rate": 9.967045944436393e-07, + "loss": 0.013, + "num_tokens": 2006280.0, + "reward": 0.72113037109375, + "reward_std": 0.011619940400123596, + "rewards//mean": 0.72113037109375, + "rewards//std": 0.03844134882092476, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0466, + "grad_norm": 3.191673517227173, + "kl": 0.14259618474170566, + "learning_rate": 9.96668121140452e-07, + "loss": 0.0143, + "num_tokens": 2015040.0, + "reward": 0.756103515625, + "reward_std": 0.016418365761637688, + "rewards//mean": 0.756103515625, + "rewards//std": 0.039689138531684875, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0468, + "grad_norm": 3.194607973098755, + "kl": 0.13389609195291996, + "learning_rate": 9.966314477794052e-07, + "loss": 0.0134, + "num_tokens": 2023640.0, + "reward": 0.7322998046875, + "reward_std": 0.010810410603880882, + "rewards//mean": 0.7322998046875, + "rewards//std": 0.037917040288448334, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.047, + "grad_norm": 2.7117807865142822, + "kl": 0.1349084647372365, + "learning_rate": 9.965945743752705e-07, + "loss": 0.0135, + "num_tokens": 2032216.0, + "reward": 0.73382568359375, + "reward_std": 0.011349475011229515, + "rewards//mean": 0.73382568359375, + "rewards//std": 0.048444636166095734, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0472, + "grad_norm": 3.4043350219726562, + "kl": 0.15029606316238642, + "learning_rate": 9.965575009429005e-07, + "loss": 0.015, + "num_tokens": 2040856.0, + "reward": 0.74774169921875, + "reward_std": 0.015436086803674698, + "rewards//mean": 0.74774169921875, + "rewards//std": 0.04153744876384735, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0474, + "grad_norm": 2.815829038619995, + "kl": 0.1407718537375331, + "learning_rate": 9.965202274972286e-07, + "loss": 0.0141, + "num_tokens": 2049408.0, + "reward": 0.72216796875, + "reward_std": 0.012544216588139534, + "rewards//mean": 0.72216796875, + "rewards//std": 0.0369827039539814, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0476, + "grad_norm": 3.4487929344177246, + "kl": 0.15310040256008506, + "learning_rate": 9.964827540532684e-07, + "loss": 0.0153, + "num_tokens": 2058016.0, + "reward": 0.72662353515625, + "reward_std": 0.016283154487609863, + "rewards//mean": 0.72662353515625, + "rewards//std": 0.04449181258678436, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0478, + "grad_norm": 3.5169429779052734, + "kl": 0.1543840290978551, + "learning_rate": 9.964450806261144e-07, + "loss": 0.0154, + "num_tokens": 2066648.0, + "reward": 0.7509765625, + "reward_std": 0.01590714603662491, + "rewards//mean": 0.7509765625, + "rewards//std": 0.034973207861185074, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.048, + "grad_norm": 3.831986665725708, + "kl": 0.1407278785482049, + "learning_rate": 9.96407207230941e-07, + "loss": 0.0141, + "num_tokens": 2075336.0, + "reward": 0.7237548828125, + "reward_std": 0.012750649824738503, + "rewards//mean": 0.7237548828125, + "rewards//std": 0.040704984217882156, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0482, + "grad_norm": 3.1790924072265625, + "kl": 0.158443967346102, + "learning_rate": 9.963691338830042e-07, + "loss": 0.0158, + "num_tokens": 2083952.0, + "reward": 0.73724365234375, + "reward_std": 0.015838809311389923, + "rewards//mean": 0.73724365234375, + "rewards//std": 0.024312397465109825, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0484, + "grad_norm": 2.7173264026641846, + "kl": 0.15428494522348046, + "learning_rate": 9.963308605976396e-07, + "loss": 0.0154, + "num_tokens": 2092624.0, + "reward": 0.76165771484375, + "reward_std": 0.011489255353808403, + "rewards//mean": 0.76165771484375, + "rewards//std": 0.02435469999909401, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0486, + "grad_norm": 2.8586461544036865, + "kl": 0.1581531437113881, + "learning_rate": 9.962923873902636e-07, + "loss": 0.0158, + "num_tokens": 2101160.0, + "reward": 0.71075439453125, + "reward_std": 0.012413685210049152, + "rewards//mean": 0.71075439453125, + "rewards//std": 0.04495181515812874, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0488, + "grad_norm": 2.937629222869873, + "kl": 0.16615721164271235, + "learning_rate": 9.962537142763732e-07, + "loss": 0.0166, + "num_tokens": 2109792.0, + "reward": 0.7449951171875, + "reward_std": 0.012411234900355339, + "rewards//mean": 0.7449951171875, + "rewards//std": 0.024953732267022133, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.049, + "grad_norm": 3.2936151027679443, + "kl": 0.2063107956200838, + "learning_rate": 9.962148412715463e-07, + "loss": 0.0206, + "num_tokens": 2118552.0, + "reward": 0.75042724609375, + "reward_std": 0.010296858847141266, + "rewards//mean": 0.75042724609375, + "rewards//std": 0.03866437450051308, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0492, + "grad_norm": 3.623534917831421, + "kl": 0.1343393293209374, + "learning_rate": 9.961757683914405e-07, + "loss": 0.0134, + "num_tokens": 2127248.0, + "reward": 0.69366455078125, + "reward_std": 0.010699542239308357, + "rewards//mean": 0.69366455078125, + "rewards//std": 0.04388511925935745, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0494, + "grad_norm": 4.162594795227051, + "kl": 0.16983078233897686, + "learning_rate": 9.961364956517946e-07, + "loss": 0.017, + "num_tokens": 2135896.0, + "reward": 0.74365234375, + "reward_std": 0.016386723145842552, + "rewards//mean": 0.74365234375, + "rewards//std": 0.04790989309549332, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0496, + "grad_norm": 3.966630458831787, + "kl": 0.18166909040883183, + "learning_rate": 9.960970230684275e-07, + "loss": 0.0182, + "num_tokens": 2144536.0, + "reward": 0.72308349609375, + "reward_std": 0.014546409249305725, + "rewards//mean": 0.72308349609375, + "rewards//std": 0.05282726511359215, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0498, + "grad_norm": 2.8995301723480225, + "kl": 0.14150721998885274, + "learning_rate": 9.960573506572389e-07, + "loss": 0.0142, + "num_tokens": 2153104.0, + "reward": 0.74957275390625, + "reward_std": 0.011014558374881744, + "rewards//mean": 0.74957275390625, + "rewards//std": 0.024101028218865395, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.05, + "grad_norm": 3.355257511138916, + "kl": 0.19822023855522275, + "learning_rate": 9.960174784342087e-07, + "loss": 0.0198, + "num_tokens": 2161736.0, + "reward": 0.7359619140625, + "reward_std": 0.011475984007120132, + "rewards//mean": 0.7359619140625, + "rewards//std": 0.037581801414489746, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0502, + "grad_norm": 3.375710964202881, + "kl": 0.16241988725960255, + "learning_rate": 9.959774064153975e-07, + "loss": 0.0162, + "num_tokens": 2170344.0, + "reward": 0.71624755859375, + "reward_std": 0.009172160178422928, + "rewards//mean": 0.71624755859375, + "rewards//std": 0.03990514203906059, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0504, + "grad_norm": 3.359903573989868, + "kl": 0.17989000072702765, + "learning_rate": 9.959371346169465e-07, + "loss": 0.018, + "num_tokens": 2179056.0, + "reward": 0.783935546875, + "reward_std": 0.0145841920748353, + "rewards//mean": 0.783935546875, + "rewards//std": 0.030487943440675735, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0506, + "grad_norm": 3.0368056297302246, + "kl": 0.1985956854186952, + "learning_rate": 9.95896663055077e-07, + "loss": 0.0199, + "num_tokens": 2187640.0, + "reward": 0.709716796875, + "reward_std": 0.011430484242737293, + "rewards//mean": 0.709716796875, + "rewards//std": 0.04344732314348221, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0508, + "grad_norm": 2.8358800411224365, + "kl": 0.1734612863510847, + "learning_rate": 9.958559917460907e-07, + "loss": 0.0173, + "num_tokens": 2196336.0, + "reward": 0.75439453125, + "reward_std": 0.00935526005923748, + "rewards//mean": 0.75439453125, + "rewards//std": 0.03223055601119995, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.051, + "grad_norm": 3.5464892387390137, + "kl": 0.1810889858752489, + "learning_rate": 9.958151207063703e-07, + "loss": 0.0181, + "num_tokens": 2205024.0, + "reward": 0.74468994140625, + "reward_std": 0.012457584962248802, + "rewards//mean": 0.74468994140625, + "rewards//std": 0.03684404492378235, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0512, + "grad_norm": 5.931046485900879, + "kl": 0.27709746547043324, + "learning_rate": 9.957740499523785e-07, + "loss": 0.0277, + "num_tokens": 2213608.0, + "reward": 0.74755859375, + "reward_std": 0.013767718337476254, + "rewards//mean": 0.74755859375, + "rewards//std": 0.04670677334070206, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0514, + "grad_norm": 3.762895107269287, + "kl": 0.17110665002837777, + "learning_rate": 9.957327795006588e-07, + "loss": 0.0171, + "num_tokens": 2222264.0, + "reward": 0.77410888671875, + "reward_std": 0.013452369719743729, + "rewards//mean": 0.77410888671875, + "rewards//std": 0.04332830756902695, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0516, + "grad_norm": 4.586542129516602, + "kl": 0.2573170200921595, + "learning_rate": 9.956913093678348e-07, + "loss": 0.0257, + "num_tokens": 2230880.0, + "reward": 0.71356201171875, + "reward_std": 0.014903232455253601, + "rewards//mean": 0.71356201171875, + "rewards//std": 0.042870305478572845, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0518, + "grad_norm": 3.6989188194274902, + "kl": 0.19445497635751963, + "learning_rate": 9.956496395706105e-07, + "loss": 0.0194, + "num_tokens": 2239608.0, + "reward": 0.7647705078125, + "reward_std": 0.013435694389045238, + "rewards//mean": 0.7647705078125, + "rewards//std": 0.03896608203649521, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.052, + "grad_norm": 4.194847106933594, + "kl": 0.22157821152359247, + "learning_rate": 9.956077701257707e-07, + "loss": 0.0222, + "num_tokens": 2248296.0, + "reward": 0.7449951171875, + "reward_std": 0.013505147770047188, + "rewards//mean": 0.7449951171875, + "rewards//std": 0.026932932436466217, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0522, + "grad_norm": 4.072581768035889, + "kl": 0.2203886266797781, + "learning_rate": 9.955657010501806e-07, + "loss": 0.022, + "num_tokens": 2256976.0, + "reward": 0.7191162109375, + "reward_std": 0.0126027287915349, + "rewards//mean": 0.7191162109375, + "rewards//std": 0.03821449726819992, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0524, + "grad_norm": 4.167779922485352, + "kl": 0.3016416598111391, + "learning_rate": 9.955234323607851e-07, + "loss": 0.0302, + "num_tokens": 2265672.0, + "reward": 0.75775146484375, + "reward_std": 0.014491424895823002, + "rewards//mean": 0.75775146484375, + "rewards//std": 0.04258937016129494, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0526, + "grad_norm": 3.6395280361175537, + "kl": 0.2628649156540632, + "learning_rate": 9.954809640746105e-07, + "loss": 0.0263, + "num_tokens": 2274336.0, + "reward": 0.76348876953125, + "reward_std": 0.012867321260273457, + "rewards//mean": 0.76348876953125, + "rewards//std": 0.039622291922569275, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0528, + "grad_norm": 3.4668288230895996, + "kl": 0.281156400218606, + "learning_rate": 9.954382962087627e-07, + "loss": 0.0281, + "num_tokens": 2282984.0, + "reward": 0.77264404296875, + "reward_std": 0.013112173415720463, + "rewards//mean": 0.77264404296875, + "rewards//std": 0.02909199893474579, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.053, + "grad_norm": 3.6522719860076904, + "kl": 0.31555200181901455, + "learning_rate": 9.953954287804284e-07, + "loss": 0.0316, + "num_tokens": 2291520.0, + "reward": 0.71893310546875, + "reward_std": 0.012379538267850876, + "rewards//mean": 0.71893310546875, + "rewards//std": 0.03527955710887909, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0532, + "grad_norm": 4.363163948059082, + "kl": 0.27224128041416407, + "learning_rate": 9.953523618068748e-07, + "loss": 0.0272, + "num_tokens": 2300080.0, + "reward": 0.72747802734375, + "reward_std": 0.01283281296491623, + "rewards//mean": 0.72747802734375, + "rewards//std": 0.029771430417895317, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0534, + "grad_norm": 4.9658308029174805, + "kl": 0.42545278184115887, + "learning_rate": 9.95309095305449e-07, + "loss": 0.0425, + "num_tokens": 2308656.0, + "reward": 0.7386474609375, + "reward_std": 0.01564006507396698, + "rewards//mean": 0.7386474609375, + "rewards//std": 0.03796651214361191, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0536, + "grad_norm": 3.5091545581817627, + "kl": 0.31685093883425, + "learning_rate": 9.952656292935788e-07, + "loss": 0.0317, + "num_tokens": 2317368.0, + "reward": 0.76239013671875, + "reward_std": 0.01158678624778986, + "rewards//mean": 0.76239013671875, + "rewards//std": 0.03563561290502548, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0538, + "grad_norm": 4.6175312995910645, + "kl": 0.38975627813488245, + "learning_rate": 9.952219637887725e-07, + "loss": 0.039, + "num_tokens": 2325992.0, + "reward": 0.736083984375, + "reward_std": 0.015487446449697018, + "rewards//mean": 0.736083984375, + "rewards//std": 0.03353699669241905, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.054, + "grad_norm": 3.726688861846924, + "kl": 0.3101207744330168, + "learning_rate": 9.951780988086183e-07, + "loss": 0.031, + "num_tokens": 2334616.0, + "reward": 0.76055908203125, + "reward_std": 0.011042140424251556, + "rewards//mean": 0.76055908203125, + "rewards//std": 0.029295260086655617, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0542, + "grad_norm": 4.996911525726318, + "kl": 0.3400790123268962, + "learning_rate": 9.95134034370785e-07, + "loss": 0.034, + "num_tokens": 2343264.0, + "reward": 0.74603271484375, + "reward_std": 0.013830387964844704, + "rewards//mean": 0.74603271484375, + "rewards//std": 0.03156227618455887, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0544, + "grad_norm": 3.765669584274292, + "kl": 0.4163520308211446, + "learning_rate": 9.95089770493022e-07, + "loss": 0.0416, + "num_tokens": 2351936.0, + "reward": 0.73028564453125, + "reward_std": 0.01131827849894762, + "rewards//mean": 0.73028564453125, + "rewards//std": 0.047021668404340744, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0546, + "grad_norm": 4.735191345214844, + "kl": 0.3753213444724679, + "learning_rate": 9.950453071931588e-07, + "loss": 0.0375, + "num_tokens": 2360560.0, + "reward": 0.74945068359375, + "reward_std": 0.013796941377222538, + "rewards//mean": 0.74945068359375, + "rewards//std": 0.0388689860701561, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0548, + "grad_norm": 3.7093846797943115, + "kl": 0.3756133262068033, + "learning_rate": 9.950006444891048e-07, + "loss": 0.0376, + "num_tokens": 2369160.0, + "reward": 0.70928955078125, + "reward_std": 0.013247143477201462, + "rewards//mean": 0.70928955078125, + "rewards//std": 0.037219706922769547, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.055, + "grad_norm": 4.347089767456055, + "kl": 0.49243751261383295, + "learning_rate": 9.949557823988506e-07, + "loss": 0.0492, + "num_tokens": 2377840.0, + "reward": 0.73736572265625, + "reward_std": 0.014801887795329094, + "rewards//mean": 0.73736572265625, + "rewards//std": 0.035927943885326385, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0552, + "grad_norm": 3.7578723430633545, + "kl": 0.6516364244744182, + "learning_rate": 9.949107209404663e-07, + "loss": 0.0652, + "num_tokens": 2386504.0, + "reward": 0.74981689453125, + "reward_std": 0.012601152062416077, + "rewards//mean": 0.74981689453125, + "rewards//std": 0.024660447612404823, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0554, + "grad_norm": 5.244663238525391, + "kl": 0.5927953533828259, + "learning_rate": 9.94865460132103e-07, + "loss": 0.0593, + "num_tokens": 2395128.0, + "reward": 0.74749755859375, + "reward_std": 0.02090274542570114, + "rewards//mean": 0.74749755859375, + "rewards//std": 0.03469066694378853, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0556, + "grad_norm": 3.8761532306671143, + "kl": 0.4448296641930938, + "learning_rate": 9.948199999919912e-07, + "loss": 0.0445, + "num_tokens": 2403824.0, + "reward": 0.71746826171875, + "reward_std": 0.010741055011749268, + "rewards//mean": 0.71746826171875, + "rewards//std": 0.03245935216546059, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0558, + "grad_norm": 4.406347751617432, + "kl": 0.6288035763427615, + "learning_rate": 9.947743405384428e-07, + "loss": 0.0629, + "num_tokens": 2412440.0, + "reward": 0.7379150390625, + "reward_std": 0.015601018443703651, + "rewards//mean": 0.7379150390625, + "rewards//std": 0.04014930874109268, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.056, + "grad_norm": 4.828309059143066, + "kl": 0.5129829635843635, + "learning_rate": 9.947284817898492e-07, + "loss": 0.0513, + "num_tokens": 2421072.0, + "reward": 0.70001220703125, + "reward_std": 0.014600463211536407, + "rewards//mean": 0.70001220703125, + "rewards//std": 0.03711136430501938, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0562, + "grad_norm": 5.15447473526001, + "kl": 0.46403655782341957, + "learning_rate": 9.946824237646824e-07, + "loss": 0.0464, + "num_tokens": 2429736.0, + "reward": 0.737060546875, + "reward_std": 0.013210458680987358, + "rewards//mean": 0.737060546875, + "rewards//std": 0.03836729750037193, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0564, + "grad_norm": 4.505428791046143, + "kl": 0.6311583276838064, + "learning_rate": 9.946361664814943e-07, + "loss": 0.0631, + "num_tokens": 2438336.0, + "reward": 0.73468017578125, + "reward_std": 0.014126582071185112, + "rewards//mean": 0.73468017578125, + "rewards//std": 0.022727172821760178, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0566, + "grad_norm": 3.6432912349700928, + "kl": 0.38745086546987295, + "learning_rate": 9.945897099589173e-07, + "loss": 0.0387, + "num_tokens": 2446944.0, + "reward": 0.722412109375, + "reward_std": 0.012055720202624798, + "rewards//mean": 0.722412109375, + "rewards//std": 0.030456148087978363, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0568, + "grad_norm": 3.8316993713378906, + "kl": 0.6913583185523748, + "learning_rate": 9.945430542156646e-07, + "loss": 0.0691, + "num_tokens": 2455528.0, + "reward": 0.75775146484375, + "reward_std": 0.013003588654100895, + "rewards//mean": 0.75775146484375, + "rewards//std": 0.02943393401801586, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.057, + "grad_norm": 4.167176246643066, + "kl": 0.5367339439690113, + "learning_rate": 9.944961992705286e-07, + "loss": 0.0537, + "num_tokens": 2464104.0, + "reward": 0.7257080078125, + "reward_std": 0.010289316065609455, + "rewards//mean": 0.7257080078125, + "rewards//std": 0.04251524433493614, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0572, + "grad_norm": 4.4267401695251465, + "kl": 0.7689229855313897, + "learning_rate": 9.944491451423827e-07, + "loss": 0.0769, + "num_tokens": 2472768.0, + "reward": 0.77447509765625, + "reward_std": 0.012099775485694408, + "rewards//mean": 0.77447509765625, + "rewards//std": 0.025441773235797882, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0574, + "grad_norm": 5.860213279724121, + "kl": 0.6941496105864644, + "learning_rate": 9.944018918501805e-07, + "loss": 0.0694, + "num_tokens": 2481432.0, + "reward": 0.72503662109375, + "reward_std": 0.01605089195072651, + "rewards//mean": 0.72503662109375, + "rewards//std": 0.03806779906153679, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0576, + "grad_norm": 6.553688049316406, + "kl": 0.5049006678164005, + "learning_rate": 9.94354439412955e-07, + "loss": 0.0505, + "num_tokens": 2490120.0, + "reward": 0.73065185546875, + "reward_std": 0.013142431154847145, + "rewards//mean": 0.73065185546875, + "rewards//std": 0.025765178725123405, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0578, + "grad_norm": 4.863574504852295, + "kl": 0.3878084821626544, + "learning_rate": 9.943067878498209e-07, + "loss": 0.0388, + "num_tokens": 2498832.0, + "reward": 0.74432373046875, + "reward_std": 0.009785640053451061, + "rewards//mean": 0.74432373046875, + "rewards//std": 0.03728553652763367, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.058, + "grad_norm": 4.903024673461914, + "kl": 0.39208444207906723, + "learning_rate": 9.942589371799714e-07, + "loss": 0.0392, + "num_tokens": 2507544.0, + "reward": 0.75457763671875, + "reward_std": 0.011581134051084518, + "rewards//mean": 0.75457763671875, + "rewards//std": 0.036779481917619705, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0582, + "grad_norm": 4.678225040435791, + "kl": 0.7103300355374813, + "learning_rate": 9.94210887422681e-07, + "loss": 0.071, + "num_tokens": 2516200.0, + "reward": 0.75054931640625, + "reward_std": 0.013709913939237595, + "rewards//mean": 0.75054931640625, + "rewards//std": 0.035935528576374054, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0584, + "grad_norm": 4.853988170623779, + "kl": 0.43904567416757345, + "learning_rate": 9.941626385973047e-07, + "loss": 0.0439, + "num_tokens": 2524768.0, + "reward": 0.75555419921875, + "reward_std": 0.011930609121918678, + "rewards//mean": 0.75555419921875, + "rewards//std": 0.025956004858016968, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0586, + "grad_norm": 5.283975601196289, + "kl": 0.34506158623844385, + "learning_rate": 9.941141907232763e-07, + "loss": 0.0345, + "num_tokens": 2533440.0, + "reward": 0.753173828125, + "reward_std": 0.009904064238071442, + "rewards//mean": 0.753173828125, + "rewards//std": 0.03344298154115677, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0588, + "grad_norm": 5.530971527099609, + "kl": 0.8286517476662993, + "learning_rate": 9.94065543820111e-07, + "loss": 0.0829, + "num_tokens": 2542128.0, + "reward": 0.77996826171875, + "reward_std": 0.01934969238936901, + "rewards//mean": 0.77996826171875, + "rewards//std": 0.02818603254854679, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.059, + "grad_norm": 5.333427429199219, + "kl": 1.1374772489070892, + "learning_rate": 9.94016697907404e-07, + "loss": 0.1137, + "num_tokens": 2550680.0, + "reward": 0.73394775390625, + "reward_std": 0.013834717683494091, + "rewards//mean": 0.73394775390625, + "rewards//std": 0.031025337055325508, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0592, + "grad_norm": 4.085994243621826, + "kl": 0.8266544556245208, + "learning_rate": 9.9396765300483e-07, + "loss": 0.0827, + "num_tokens": 2559424.0, + "reward": 0.759521484375, + "reward_std": 0.012676459737122059, + "rewards//mean": 0.759521484375, + "rewards//std": 0.039302803575992584, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0594, + "grad_norm": 4.988473892211914, + "kl": 1.1103221122175455, + "learning_rate": 9.939184091321444e-07, + "loss": 0.111, + "num_tokens": 2568040.0, + "reward": 0.7689208984375, + "reward_std": 0.015579704195261002, + "rewards//mean": 0.7689208984375, + "rewards//std": 0.038413625210523605, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0596, + "grad_norm": 4.135498046875, + "kl": 0.9984888043254614, + "learning_rate": 9.938689663091827e-07, + "loss": 0.0998, + "num_tokens": 2576776.0, + "reward": 0.7529296875, + "reward_std": 0.01220305822789669, + "rewards//mean": 0.7529296875, + "rewards//std": 0.041240144520998, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0598, + "grad_norm": 4.1136369705200195, + "kl": 0.9260764308273792, + "learning_rate": 9.938193245558604e-07, + "loss": 0.0926, + "num_tokens": 2585392.0, + "reward": 0.71807861328125, + "reward_std": 0.011635358445346355, + "rewards//mean": 0.71807861328125, + "rewards//std": 0.04143089801073074, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.06, + "grad_norm": 5.337719917297363, + "kl": 0.7270172508433461, + "learning_rate": 9.937694838921733e-07, + "loss": 0.0727, + "num_tokens": 2594032.0, + "reward": 0.7274169921875, + "reward_std": 0.009733819402754307, + "rewards//mean": 0.7274169921875, + "rewards//std": 0.026634516194462776, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0602, + "grad_norm": 5.916358470916748, + "kl": 0.6537883328273892, + "learning_rate": 9.93719444338197e-07, + "loss": 0.0654, + "num_tokens": 2602736.0, + "reward": 0.72100830078125, + "reward_std": 0.01079651154577732, + "rewards//mean": 0.72100830078125, + "rewards//std": 0.047860853374004364, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0604, + "grad_norm": 4.627238750457764, + "kl": 0.9710520636290312, + "learning_rate": 9.936692059140878e-07, + "loss": 0.0971, + "num_tokens": 2611384.0, + "reward": 0.763671875, + "reward_std": 0.015687113627791405, + "rewards//mean": 0.763671875, + "rewards//std": 0.03544783964753151, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0606, + "grad_norm": 4.747879505157471, + "kl": 0.9860076494514942, + "learning_rate": 9.936187686400814e-07, + "loss": 0.0986, + "num_tokens": 2620152.0, + "reward": 0.7462158203125, + "reward_std": 0.015207450836896896, + "rewards//mean": 0.7462158203125, + "rewards//std": 0.040624577552080154, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0608, + "grad_norm": 4.995135307312012, + "kl": 0.5666801882907748, + "learning_rate": 9.93568132536494e-07, + "loss": 0.0567, + "num_tokens": 2628816.0, + "reward": 0.7508544921875, + "reward_std": 0.013964075595140457, + "rewards//mean": 0.7508544921875, + "rewards//std": 0.03339655324816704, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.061, + "grad_norm": 4.070781707763672, + "kl": 0.782807239331305, + "learning_rate": 9.935172976237217e-07, + "loss": 0.0783, + "num_tokens": 2637496.0, + "reward": 0.7484130859375, + "reward_std": 0.013928147032856941, + "rewards//mean": 0.7484130859375, + "rewards//std": 0.0234345942735672, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0612, + "grad_norm": 4.972671985626221, + "kl": 1.1988671775907278, + "learning_rate": 9.93466263922241e-07, + "loss": 0.1199, + "num_tokens": 2646104.0, + "reward": 0.751953125, + "reward_std": 0.014233101159334183, + "rewards//mean": 0.751953125, + "rewards//std": 0.03372514620423317, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0614, + "grad_norm": 4.157861709594727, + "kl": 0.9355712188407779, + "learning_rate": 9.934150314526083e-07, + "loss": 0.0936, + "num_tokens": 2654744.0, + "reward": 0.76129150390625, + "reward_std": 0.012417576275765896, + "rewards//mean": 0.76129150390625, + "rewards//std": 0.017401453107595444, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0616, + "grad_norm": 4.780778884887695, + "kl": 0.7273468747735023, + "learning_rate": 9.933636002354599e-07, + "loss": 0.0727, + "num_tokens": 2663376.0, + "reward": 0.7191162109375, + "reward_std": 0.010411866009235382, + "rewards//mean": 0.7191162109375, + "rewards//std": 0.023475898429751396, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0618, + "grad_norm": 5.462563514709473, + "kl": 1.1083982829004526, + "learning_rate": 9.933119702915124e-07, + "loss": 0.1108, + "num_tokens": 2671952.0, + "reward": 0.7236328125, + "reward_std": 0.015762878581881523, + "rewards//mean": 0.7236328125, + "rewards//std": 0.03274126723408699, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.062, + "grad_norm": 4.324944972991943, + "kl": 0.9199625449255109, + "learning_rate": 9.93260141641562e-07, + "loss": 0.092, + "num_tokens": 2680624.0, + "reward": 0.73858642578125, + "reward_std": 0.01111672818660736, + "rewards//mean": 0.73858642578125, + "rewards//std": 0.03486433997750282, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0622, + "grad_norm": 5.655622959136963, + "kl": 1.42093366663903, + "learning_rate": 9.932081143064858e-07, + "loss": 0.1421, + "num_tokens": 2689176.0, + "reward": 0.76385498046875, + "reward_std": 0.018049361184239388, + "rewards//mean": 0.76385498046875, + "rewards//std": 0.042472273111343384, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0624, + "grad_norm": 4.6508097648620605, + "kl": 0.7688910737633705, + "learning_rate": 9.931558883072402e-07, + "loss": 0.0769, + "num_tokens": 2697864.0, + "reward": 0.75213623046875, + "reward_std": 0.013499232940375805, + "rewards//mean": 0.75213623046875, + "rewards//std": 0.030488377436995506, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0626, + "grad_norm": 4.622182846069336, + "kl": 0.4912436017766595, + "learning_rate": 9.931034636648616e-07, + "loss": 0.0491, + "num_tokens": 2706480.0, + "reward": 0.71551513671875, + "reward_std": 0.012210061773657799, + "rewards//mean": 0.71551513671875, + "rewards//std": 0.021514564752578735, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0628, + "grad_norm": 4.870460510253906, + "kl": 0.8243315378203988, + "learning_rate": 9.930508404004666e-07, + "loss": 0.0824, + "num_tokens": 2715056.0, + "reward": 0.7562255859375, + "reward_std": 0.0113151203840971, + "rewards//mean": 0.7562255859375, + "rewards//std": 0.030510524287819862, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.063, + "grad_norm": 4.56314754486084, + "kl": 0.5322395460680127, + "learning_rate": 9.929980185352525e-07, + "loss": 0.0532, + "num_tokens": 2723632.0, + "reward": 0.7457275390625, + "reward_std": 0.008279968984425068, + "rewards//mean": 0.7457275390625, + "rewards//std": 0.03119351714849472, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0632, + "grad_norm": 4.371639728546143, + "kl": 0.6331577720120549, + "learning_rate": 9.929449980904951e-07, + "loss": 0.0633, + "num_tokens": 2732264.0, + "reward": 0.72454833984375, + "reward_std": 0.012990620918571949, + "rewards//mean": 0.72454833984375, + "rewards//std": 0.03534514456987381, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0634, + "grad_norm": 3.9564285278320312, + "kl": 1.0012456197291613, + "learning_rate": 9.928917790875516e-07, + "loss": 0.1001, + "num_tokens": 2740960.0, + "reward": 0.7568359375, + "reward_std": 0.012390851974487305, + "rewards//mean": 0.7568359375, + "rewards//std": 0.0322268009185791, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0636, + "grad_norm": 4.573144912719727, + "kl": 0.969984645023942, + "learning_rate": 9.928383615478586e-07, + "loss": 0.097, + "num_tokens": 2749528.0, + "reward": 0.74847412109375, + "reward_std": 0.010943949222564697, + "rewards//mean": 0.74847412109375, + "rewards//std": 0.028634140267968178, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0638, + "grad_norm": 4.976900100708008, + "kl": 0.735985585488379, + "learning_rate": 9.927847454929322e-07, + "loss": 0.0736, + "num_tokens": 2758176.0, + "reward": 0.753662109375, + "reward_std": 0.01002872921526432, + "rewards//mean": 0.753662109375, + "rewards//std": 0.02685553953051567, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.064, + "grad_norm": 4.124001979827881, + "kl": 0.8998236870393157, + "learning_rate": 9.927309309443695e-07, + "loss": 0.09, + "num_tokens": 2766760.0, + "reward": 0.76275634765625, + "reward_std": 0.013413220643997192, + "rewards//mean": 0.76275634765625, + "rewards//std": 0.035897597670555115, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0642, + "grad_norm": 6.085822582244873, + "kl": 1.1758588114753366, + "learning_rate": 9.926769179238464e-07, + "loss": 0.1176, + "num_tokens": 2775344.0, + "reward": 0.74359130859375, + "reward_std": 0.01115325279533863, + "rewards//mean": 0.74359130859375, + "rewards//std": 0.038590699434280396, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0644, + "grad_norm": 5.7256574630737305, + "kl": 0.8206800632178783, + "learning_rate": 9.926227064531199e-07, + "loss": 0.0821, + "num_tokens": 2783880.0, + "reward": 0.74560546875, + "reward_std": 0.014594745822250843, + "rewards//mean": 0.74560546875, + "rewards//std": 0.030405409634113312, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0646, + "grad_norm": 4.412210941314697, + "kl": 1.0538199730217457, + "learning_rate": 9.925682965540263e-07, + "loss": 0.1054, + "num_tokens": 2792480.0, + "reward": 0.74951171875, + "reward_std": 0.016410717740654945, + "rewards//mean": 0.74951171875, + "rewards//std": 0.036288533359766006, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0648, + "grad_norm": 6.5532002449035645, + "kl": 0.456143987365067, + "learning_rate": 9.925136882484815e-07, + "loss": 0.0456, + "num_tokens": 2801088.0, + "reward": 0.7254638671875, + "reward_std": 0.00946881715208292, + "rewards//mean": 0.7254638671875, + "rewards//std": 0.033526841551065445, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.065, + "grad_norm": 7.161509990692139, + "kl": 1.4719884041696787, + "learning_rate": 9.92458881558482e-07, + "loss": 0.1472, + "num_tokens": 2809768.0, + "reward": 0.74249267578125, + "reward_std": 0.012742443010210991, + "rewards//mean": 0.74249267578125, + "rewards//std": 0.03293528035283089, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0652, + "grad_norm": 4.279499530792236, + "kl": 1.221362279728055, + "learning_rate": 9.92403876506104e-07, + "loss": 0.1221, + "num_tokens": 2818328.0, + "reward": 0.75152587890625, + "reward_std": 0.015956521034240723, + "rewards//mean": 0.75152587890625, + "rewards//std": 0.03385858237743378, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0654, + "grad_norm": 5.586702823638916, + "kl": 1.3700648723170161, + "learning_rate": 9.923486731135033e-07, + "loss": 0.137, + "num_tokens": 2826984.0, + "reward": 0.7186279296875, + "reward_std": 0.020838936790823936, + "rewards//mean": 0.7186279296875, + "rewards//std": 0.059400323778390884, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0656, + "grad_norm": 4.409854888916016, + "kl": 1.432421556673944, + "learning_rate": 9.922932714029163e-07, + "loss": 0.1432, + "num_tokens": 2835544.0, + "reward": 0.75897216796875, + "reward_std": 0.014281929470598698, + "rewards//mean": 0.75897216796875, + "rewards//std": 0.028811747208237648, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0658, + "grad_norm": 6.012087821960449, + "kl": 0.9793825000524521, + "learning_rate": 9.92237671396658e-07, + "loss": 0.0979, + "num_tokens": 2844184.0, + "reward": 0.76617431640625, + "reward_std": 0.01349995844066143, + "rewards//mean": 0.76617431640625, + "rewards//std": 0.028517598286271095, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.066, + "grad_norm": 4.234142780303955, + "kl": 0.9242843044921756, + "learning_rate": 9.921818731171248e-07, + "loss": 0.0924, + "num_tokens": 2852824.0, + "reward": 0.72479248046875, + "reward_std": 0.015175123699009418, + "rewards//mean": 0.72479248046875, + "rewards//std": 0.026584099978208542, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0662, + "grad_norm": 4.870720386505127, + "kl": 1.1872072061523795, + "learning_rate": 9.921258765867919e-07, + "loss": 0.1187, + "num_tokens": 2861568.0, + "reward": 0.755126953125, + "reward_std": 0.013591473922133446, + "rewards//mean": 0.755126953125, + "rewards//std": 0.0360499769449234, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0664, + "grad_norm": 5.395249366760254, + "kl": 0.9591629793867469, + "learning_rate": 9.920696818282147e-07, + "loss": 0.0959, + "num_tokens": 2870168.0, + "reward": 0.7657470703125, + "reward_std": 0.014035122469067574, + "rewards//mean": 0.7657470703125, + "rewards//std": 0.029528219252824783, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0666, + "grad_norm": 4.52071475982666, + "kl": 1.1475577987730503, + "learning_rate": 9.920132888640284e-07, + "loss": 0.1148, + "num_tokens": 2878752.0, + "reward": 0.72406005859375, + "reward_std": 0.013598522171378136, + "rewards//mean": 0.72406005859375, + "rewards//std": 0.03329412639141083, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0668, + "grad_norm": 5.5705790519714355, + "kl": 1.3797461157664657, + "learning_rate": 9.919566977169485e-07, + "loss": 0.138, + "num_tokens": 2887504.0, + "reward": 0.7239990234375, + "reward_std": 0.016014471650123596, + "rewards//mean": 0.7239990234375, + "rewards//std": 0.03900180757045746, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.067, + "grad_norm": 4.385342597961426, + "kl": 1.0792835243046284, + "learning_rate": 9.918999084097694e-07, + "loss": 0.1079, + "num_tokens": 2896176.0, + "reward": 0.70703125, + "reward_std": 0.01298388373106718, + "rewards//mean": 0.70703125, + "rewards//std": 0.02993578463792801, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0672, + "grad_norm": 5.2752838134765625, + "kl": 1.3719432950019836, + "learning_rate": 9.91842920965366e-07, + "loss": 0.1372, + "num_tokens": 2904808.0, + "reward": 0.7362060546875, + "reward_std": 0.014960775151848793, + "rewards//mean": 0.7362060546875, + "rewards//std": 0.03218143805861473, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0674, + "grad_norm": 4.5603251457214355, + "kl": 0.9280480965971947, + "learning_rate": 9.91785735406693e-07, + "loss": 0.0928, + "num_tokens": 2913544.0, + "reward": 0.723876953125, + "reward_std": 0.01555250771343708, + "rewards//mean": 0.723876953125, + "rewards//std": 0.037711478769779205, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0676, + "grad_norm": 5.770708084106445, + "kl": 0.7606811327859759, + "learning_rate": 9.917283517567843e-07, + "loss": 0.0761, + "num_tokens": 2922208.0, + "reward": 0.7403564453125, + "reward_std": 0.013976898044347763, + "rewards//mean": 0.7403564453125, + "rewards//std": 0.028704695403575897, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0678, + "grad_norm": 4.471171855926514, + "kl": 1.3800368467345834, + "learning_rate": 9.916707700387545e-07, + "loss": 0.138, + "num_tokens": 2930904.0, + "reward": 0.73065185546875, + "reward_std": 0.01717003807425499, + "rewards//mean": 0.73065185546875, + "rewards//std": 0.038735173642635345, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.068, + "grad_norm": 4.665797233581543, + "kl": 1.1380757903680205, + "learning_rate": 9.916129902757974e-07, + "loss": 0.1138, + "num_tokens": 2939600.0, + "reward": 0.75115966796875, + "reward_std": 0.010834988206624985, + "rewards//mean": 0.75115966796875, + "rewards//std": 0.0265179630368948, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0682, + "grad_norm": 7.105188846588135, + "kl": 0.8124034851789474, + "learning_rate": 9.915550124911866e-07, + "loss": 0.0812, + "num_tokens": 2948432.0, + "reward": 0.7431640625, + "reward_std": 0.012510347180068493, + "rewards//mean": 0.7431640625, + "rewards//std": 0.025976480916142464, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0684, + "grad_norm": 6.090428352355957, + "kl": 1.9907669760286808, + "learning_rate": 9.914968367082755e-07, + "loss": 0.1991, + "num_tokens": 2957032.0, + "reward": 0.76763916015625, + "reward_std": 0.016988936811685562, + "rewards//mean": 0.76763916015625, + "rewards//std": 0.028450636193156242, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0686, + "grad_norm": 4.496386528015137, + "kl": 1.0567503031343222, + "learning_rate": 9.914384629504973e-07, + "loss": 0.1057, + "num_tokens": 2965680.0, + "reward": 0.7489013671875, + "reward_std": 0.016614051535725594, + "rewards//mean": 0.7489013671875, + "rewards//std": 0.038578782230615616, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0688, + "grad_norm": 5.6869330406188965, + "kl": 1.4211051985621452, + "learning_rate": 9.913798912413652e-07, + "loss": 0.1421, + "num_tokens": 2974304.0, + "reward": 0.701171875, + "reward_std": 0.014944510534405708, + "rewards//mean": 0.701171875, + "rewards//std": 0.044792965054512024, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.069, + "grad_norm": 6.705982208251953, + "kl": 1.7800831086933613, + "learning_rate": 9.913211216044713e-07, + "loss": 0.178, + "num_tokens": 2982920.0, + "reward": 0.7130126953125, + "reward_std": 0.013807592913508415, + "rewards//mean": 0.7130126953125, + "rewards//std": 0.044018279761075974, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0692, + "grad_norm": 5.911066055297852, + "kl": 0.8889193050563335, + "learning_rate": 9.912621540634886e-07, + "loss": 0.0889, + "num_tokens": 2991640.0, + "reward": 0.7552490234375, + "reward_std": 0.012438319623470306, + "rewards//mean": 0.7552490234375, + "rewards//std": 0.027963140979409218, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0694, + "grad_norm": 4.615927219390869, + "kl": 1.5478127505630255, + "learning_rate": 9.91202988642169e-07, + "loss": 0.1548, + "num_tokens": 3000224.0, + "reward": 0.74017333984375, + "reward_std": 0.02050355263054371, + "rewards//mean": 0.74017333984375, + "rewards//std": 0.04133981838822365, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0696, + "grad_norm": 4.6172404289245605, + "kl": 1.6639066198840737, + "learning_rate": 9.911436253643443e-07, + "loss": 0.1664, + "num_tokens": 3008880.0, + "reward": 0.727783203125, + "reward_std": 0.01631959155201912, + "rewards//mean": 0.727783203125, + "rewards//std": 0.034024547785520554, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0698, + "grad_norm": 7.348176002502441, + "kl": 1.608157278969884, + "learning_rate": 9.91084064253926e-07, + "loss": 0.1608, + "num_tokens": 3017536.0, + "reward": 0.72088623046875, + "reward_std": 0.012965100817382336, + "rewards//mean": 0.72088623046875, + "rewards//std": 0.03418118134140968, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.07, + "grad_norm": 11.13676929473877, + "kl": 2.0739471651613712, + "learning_rate": 9.910243053349055e-07, + "loss": 0.2074, + "num_tokens": 3026160.0, + "reward": 0.75830078125, + "reward_std": 0.015678374096751213, + "rewards//mean": 0.75830078125, + "rewards//std": 0.033886346966028214, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0702, + "grad_norm": 4.341561317443848, + "kl": 0.7875896524637938, + "learning_rate": 9.909643486313533e-07, + "loss": 0.0788, + "num_tokens": 3034792.0, + "reward": 0.77484130859375, + "reward_std": 0.012139599770307541, + "rewards//mean": 0.77484130859375, + "rewards//std": 0.025107571855187416, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0704, + "grad_norm": 34.89895248413086, + "kl": 1.1917068948969245, + "learning_rate": 9.909041941674204e-07, + "loss": 0.1192, + "num_tokens": 3043432.0, + "reward": 0.7481689453125, + "reward_std": 0.014628879725933075, + "rewards//mean": 0.7481689453125, + "rewards//std": 0.029618307948112488, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0706, + "grad_norm": 3.9406888484954834, + "kl": 1.3195801004767418, + "learning_rate": 9.908438419673366e-07, + "loss": 0.132, + "num_tokens": 3052008.0, + "reward": 0.75390625, + "reward_std": 0.01344769075512886, + "rewards//mean": 0.75390625, + "rewards//std": 0.02626393362879753, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0708, + "grad_norm": 5.148041248321533, + "kl": 1.028293943963945, + "learning_rate": 9.90783292055412e-07, + "loss": 0.1028, + "num_tokens": 3060680.0, + "reward": 0.76177978515625, + "reward_std": 0.012960381805896759, + "rewards//mean": 0.76177978515625, + "rewards//std": 0.031191756948828697, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.071, + "grad_norm": 6.117336273193359, + "kl": 1.357841451652348, + "learning_rate": 9.907225444560361e-07, + "loss": 0.1358, + "num_tokens": 3069312.0, + "reward": 0.761474609375, + "reward_std": 0.019020909443497658, + "rewards//mean": 0.761474609375, + "rewards//std": 0.03357308730483055, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0712, + "grad_norm": 4.669897556304932, + "kl": 0.9688333803787827, + "learning_rate": 9.90661599193678e-07, + "loss": 0.0969, + "num_tokens": 3078032.0, + "reward": 0.74676513671875, + "reward_std": 0.014538027346134186, + "rewards//mean": 0.74676513671875, + "rewards//std": 0.02792271412909031, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0714, + "grad_norm": 4.874142169952393, + "kl": 1.1960312463343143, + "learning_rate": 9.906004562928863e-07, + "loss": 0.1196, + "num_tokens": 3086656.0, + "reward": 0.7301025390625, + "reward_std": 0.01636132039129734, + "rewards//mean": 0.7301025390625, + "rewards//std": 0.025767896324396133, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0716, + "grad_norm": 4.525518894195557, + "kl": 0.942520503886044, + "learning_rate": 9.905391157782897e-07, + "loss": 0.0943, + "num_tokens": 3095184.0, + "reward": 0.74371337890625, + "reward_std": 0.013057741336524487, + "rewards//mean": 0.74371337890625, + "rewards//std": 0.03897321969270706, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0718, + "grad_norm": 4.909975528717041, + "kl": 0.9185402356088161, + "learning_rate": 9.904775776745956e-07, + "loss": 0.0919, + "num_tokens": 3103768.0, + "reward": 0.77825927734375, + "reward_std": 0.012431308627128601, + "rewards//mean": 0.77825927734375, + "rewards//std": 0.034370649605989456, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.072, + "grad_norm": 6.116695880889893, + "kl": 0.7805115794762969, + "learning_rate": 9.904158420065922e-07, + "loss": 0.0781, + "num_tokens": 3112464.0, + "reward": 0.76300048828125, + "reward_std": 0.012985588982701302, + "rewards//mean": 0.76300048828125, + "rewards//std": 0.03321173042058945, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0722, + "grad_norm": 4.855072021484375, + "kl": 1.4850094048306346, + "learning_rate": 9.903539087991461e-07, + "loss": 0.1485, + "num_tokens": 3121000.0, + "reward": 0.70574951171875, + "reward_std": 0.013892744667828083, + "rewards//mean": 0.70574951171875, + "rewards//std": 0.045972708612680435, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0724, + "grad_norm": 4.527280330657959, + "kl": 1.5802220031619072, + "learning_rate": 9.902917780772042e-07, + "loss": 0.158, + "num_tokens": 3129608.0, + "reward": 0.7613525390625, + "reward_std": 0.019554516300559044, + "rewards//mean": 0.7613525390625, + "rewards//std": 0.03348527476191521, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0726, + "grad_norm": 4.968642234802246, + "kl": 1.7012907210737467, + "learning_rate": 9.902294498657929e-07, + "loss": 0.1701, + "num_tokens": 3138360.0, + "reward": 0.74285888671875, + "reward_std": 0.015753204002976418, + "rewards//mean": 0.74285888671875, + "rewards//std": 0.04993502423167229, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0728, + "grad_norm": 5.95297384262085, + "kl": 1.2917128959670663, + "learning_rate": 9.901669241900176e-07, + "loss": 0.1292, + "num_tokens": 3146912.0, + "reward": 0.73187255859375, + "reward_std": 0.012270906008780003, + "rewards//mean": 0.73187255859375, + "rewards//std": 0.035742923617362976, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.073, + "grad_norm": 6.7177581787109375, + "kl": 1.2866203812882304, + "learning_rate": 9.90104201075064e-07, + "loss": 0.1287, + "num_tokens": 3155568.0, + "reward": 0.712890625, + "reward_std": 0.01278286799788475, + "rewards//mean": 0.712890625, + "rewards//std": 0.04508937895298004, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0732, + "grad_norm": 4.225528717041016, + "kl": 1.4042665641754866, + "learning_rate": 9.900412805461966e-07, + "loss": 0.1404, + "num_tokens": 3164336.0, + "reward": 0.75274658203125, + "reward_std": 0.014368729665875435, + "rewards//mean": 0.75274658203125, + "rewards//std": 0.040323637425899506, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0734, + "grad_norm": 6.365389823913574, + "kl": 2.0204008100554347, + "learning_rate": 9.899781626287602e-07, + "loss": 0.202, + "num_tokens": 3173144.0, + "reward": 0.7510986328125, + "reward_std": 0.012239251285791397, + "rewards//mean": 0.7510986328125, + "rewards//std": 0.0335485078394413, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0736, + "grad_norm": 3.669586420059204, + "kl": 0.9549193810671568, + "learning_rate": 9.899148473481784e-07, + "loss": 0.0955, + "num_tokens": 3181768.0, + "reward": 0.7335205078125, + "reward_std": 0.010605204850435257, + "rewards//mean": 0.7335205078125, + "rewards//std": 0.03359900414943695, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0738, + "grad_norm": 5.0829691886901855, + "kl": 1.339660044759512, + "learning_rate": 9.898513347299547e-07, + "loss": 0.134, + "num_tokens": 3190400.0, + "reward": 0.70855712890625, + "reward_std": 0.01506077405065298, + "rewards//mean": 0.70855712890625, + "rewards//std": 0.04746353626251221, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.074, + "grad_norm": 4.2541422843933105, + "kl": 0.9790180828422308, + "learning_rate": 9.89787624799672e-07, + "loss": 0.0979, + "num_tokens": 3199032.0, + "reward": 0.76324462890625, + "reward_std": 0.011934377253055573, + "rewards//mean": 0.76324462890625, + "rewards//std": 0.03299635276198387, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0742, + "grad_norm": 4.490905284881592, + "kl": 1.3595623094588518, + "learning_rate": 9.897237175829926e-07, + "loss": 0.136, + "num_tokens": 3207600.0, + "reward": 0.74859619140625, + "reward_std": 0.016955040395259857, + "rewards//mean": 0.74859619140625, + "rewards//std": 0.03239446505904198, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0744, + "grad_norm": 6.865668773651123, + "kl": 1.6440453492105007, + "learning_rate": 9.896596131056582e-07, + "loss": 0.1644, + "num_tokens": 3216240.0, + "reward": 0.73419189453125, + "reward_std": 0.012222332879900932, + "rewards//mean": 0.73419189453125, + "rewards//std": 0.037753552198410034, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0746, + "grad_norm": 4.67140007019043, + "kl": 1.8705146927386522, + "learning_rate": 9.895953113934903e-07, + "loss": 0.1871, + "num_tokens": 3224856.0, + "reward": 0.77838134765625, + "reward_std": 0.022047007456421852, + "rewards//mean": 0.77838134765625, + "rewards//std": 0.035951532423496246, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0748, + "grad_norm": 4.3647847175598145, + "kl": 1.3531591948121786, + "learning_rate": 9.895308124723896e-07, + "loss": 0.1353, + "num_tokens": 3233608.0, + "reward": 0.76177978515625, + "reward_std": 0.016040973365306854, + "rewards//mean": 0.76177978515625, + "rewards//std": 0.037015385925769806, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.075, + "grad_norm": 7.428527355194092, + "kl": 1.886878363788128, + "learning_rate": 9.89466116368336e-07, + "loss": 0.1887, + "num_tokens": 3242224.0, + "reward": 0.714111328125, + "reward_std": 0.013848571106791496, + "rewards//mean": 0.714111328125, + "rewards//std": 0.051243580877780914, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0752, + "grad_norm": 5.991532802581787, + "kl": 1.8202048065140843, + "learning_rate": 9.894012231073895e-07, + "loss": 0.182, + "num_tokens": 3250880.0, + "reward": 0.74603271484375, + "reward_std": 0.016969073563814163, + "rewards//mean": 0.74603271484375, + "rewards//std": 0.04127238690853119, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0754, + "grad_norm": 4.161983013153076, + "kl": 1.292853050865233, + "learning_rate": 9.893361327156884e-07, + "loss": 0.1293, + "num_tokens": 3259592.0, + "reward": 0.74908447265625, + "reward_std": 0.01443011499941349, + "rewards//mean": 0.74908447265625, + "rewards//std": 0.037323277443647385, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0756, + "grad_norm": 4.299761772155762, + "kl": 1.0473097246140242, + "learning_rate": 9.89270845219452e-07, + "loss": 0.1047, + "num_tokens": 3268216.0, + "reward": 0.76910400390625, + "reward_std": 0.018593017011880875, + "rewards//mean": 0.76910400390625, + "rewards//std": 0.04182399809360504, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0758, + "grad_norm": 6.520134925842285, + "kl": 1.0033248355612159, + "learning_rate": 9.892053606449774e-07, + "loss": 0.1003, + "num_tokens": 3276832.0, + "reward": 0.7694091796875, + "reward_std": 0.01282799057662487, + "rewards//mean": 0.7694091796875, + "rewards//std": 0.03313809260725975, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.076, + "grad_norm": 4.847512245178223, + "kl": 1.791846677660942, + "learning_rate": 9.891396790186422e-07, + "loss": 0.1792, + "num_tokens": 3285440.0, + "reward": 0.7230224609375, + "reward_std": 0.014002474956214428, + "rewards//mean": 0.7230224609375, + "rewards//std": 0.0384167805314064, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0762, + "grad_norm": 6.757416725158691, + "kl": 1.1047533452510834, + "learning_rate": 9.890738003669027e-07, + "loss": 0.1105, + "num_tokens": 3294056.0, + "reward": 0.76422119140625, + "reward_std": 0.012958530336618423, + "rewards//mean": 0.76422119140625, + "rewards//std": 0.04590053856372833, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0764, + "grad_norm": 6.155483245849609, + "kl": 1.2366701336577535, + "learning_rate": 9.89007724716295e-07, + "loss": 0.1237, + "num_tokens": 3302832.0, + "reward": 0.7449951171875, + "reward_std": 0.011507006362080574, + "rewards//mean": 0.7449951171875, + "rewards//std": 0.03625493869185448, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0766, + "grad_norm": 4.060118675231934, + "kl": 1.4494256637990475, + "learning_rate": 9.889414520934343e-07, + "loss": 0.1449, + "num_tokens": 3311488.0, + "reward": 0.74951171875, + "reward_std": 0.0157524012029171, + "rewards//mean": 0.74951171875, + "rewards//std": 0.02822420373558998, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0768, + "grad_norm": 5.359792232513428, + "kl": 1.7442060075700283, + "learning_rate": 9.88874982525015e-07, + "loss": 0.1744, + "num_tokens": 3320128.0, + "reward": 0.71612548828125, + "reward_std": 0.018574664369225502, + "rewards//mean": 0.71612548828125, + "rewards//std": 0.050058554857969284, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.077, + "grad_norm": 4.171280860900879, + "kl": 0.897304117679596, + "learning_rate": 9.888083160378112e-07, + "loss": 0.0897, + "num_tokens": 3328744.0, + "reward": 0.73583984375, + "reward_std": 0.010158386081457138, + "rewards//mean": 0.73583984375, + "rewards//std": 0.02818985842168331, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0772, + "grad_norm": 4.775302886962891, + "kl": 1.5411655902862549, + "learning_rate": 9.887414526586763e-07, + "loss": 0.1541, + "num_tokens": 3337352.0, + "reward": 0.73626708984375, + "reward_std": 0.014250718057155609, + "rewards//mean": 0.73626708984375, + "rewards//std": 0.034910768270492554, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0774, + "grad_norm": 4.278979301452637, + "kl": 1.5506847016513348, + "learning_rate": 9.886743924145426e-07, + "loss": 0.1551, + "num_tokens": 3345952.0, + "reward": 0.77008056640625, + "reward_std": 0.0142056904733181, + "rewards//mean": 0.77008056640625, + "rewards//std": 0.03488908335566521, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0776, + "grad_norm": 5.347935676574707, + "kl": 1.5720333755016327, + "learning_rate": 9.886071353324222e-07, + "loss": 0.1572, + "num_tokens": 3354552.0, + "reward": 0.72705078125, + "reward_std": 0.014094813726842403, + "rewards//mean": 0.72705078125, + "rewards//std": 0.03180694952607155, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0778, + "grad_norm": 5.136808395385742, + "kl": 1.092930156737566, + "learning_rate": 9.88539681439406e-07, + "loss": 0.1093, + "num_tokens": 3363192.0, + "reward": 0.73883056640625, + "reward_std": 0.015286346897482872, + "rewards//mean": 0.73883056640625, + "rewards//std": 0.0341452918946743, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.078, + "grad_norm": 6.12586784362793, + "kl": 1.5505319805815816, + "learning_rate": 9.884720307626646e-07, + "loss": 0.1551, + "num_tokens": 3371832.0, + "reward": 0.741455078125, + "reward_std": 0.015485553070902824, + "rewards//mean": 0.741455078125, + "rewards//std": 0.04268811270594597, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0782, + "grad_norm": 3.5782203674316406, + "kl": 1.7768533248454332, + "learning_rate": 9.884041833294475e-07, + "loss": 0.1777, + "num_tokens": 3380448.0, + "reward": 0.76104736328125, + "reward_std": 0.022384842857718468, + "rewards//mean": 0.76104736328125, + "rewards//std": 0.03959553688764572, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0784, + "grad_norm": 7.191686153411865, + "kl": 1.9574782270938158, + "learning_rate": 9.883361391670839e-07, + "loss": 0.1957, + "num_tokens": 3389040.0, + "reward": 0.74920654296875, + "reward_std": 0.01802128367125988, + "rewards//mean": 0.74920654296875, + "rewards//std": 0.03243042528629303, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0786, + "grad_norm": 3.984739303588867, + "kl": 2.038462040014565, + "learning_rate": 9.882678983029817e-07, + "loss": 0.2038, + "num_tokens": 3397736.0, + "reward": 0.73284912109375, + "reward_std": 0.018375638872385025, + "rewards//mean": 0.73284912109375, + "rewards//std": 0.03866124153137207, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0788, + "grad_norm": 5.110982894897461, + "kl": 1.4865666590631008, + "learning_rate": 9.881994607646286e-07, + "loss": 0.1487, + "num_tokens": 3406328.0, + "reward": 0.74188232421875, + "reward_std": 0.010195759125053883, + "rewards//mean": 0.74188232421875, + "rewards//std": 0.016952887177467346, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.079, + "grad_norm": 7.007421016693115, + "kl": 2.300824441947043, + "learning_rate": 9.881308265795911e-07, + "loss": 0.2301, + "num_tokens": 3414960.0, + "reward": 0.7236328125, + "reward_std": 0.01930246688425541, + "rewards//mean": 0.7236328125, + "rewards//std": 0.04316278174519539, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0792, + "grad_norm": 4.596628665924072, + "kl": 1.854134103283286, + "learning_rate": 9.88061995775515e-07, + "loss": 0.1854, + "num_tokens": 3423696.0, + "reward": 0.7176513671875, + "reward_std": 0.014456256292760372, + "rewards//mean": 0.7176513671875, + "rewards//std": 0.042821768671274185, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0794, + "grad_norm": 4.418922424316406, + "kl": 1.2172086499631405, + "learning_rate": 9.879929683801253e-07, + "loss": 0.1217, + "num_tokens": 3432248.0, + "reward": 0.75030517578125, + "reward_std": 0.014133303426206112, + "rewards//mean": 0.75030517578125, + "rewards//std": 0.03540205955505371, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0796, + "grad_norm": 5.769720077514648, + "kl": 1.0407563131302595, + "learning_rate": 9.879237444212264e-07, + "loss": 0.1041, + "num_tokens": 3440928.0, + "reward": 0.75732421875, + "reward_std": 0.01361837238073349, + "rewards//mean": 0.75732421875, + "rewards//std": 0.026960179209709167, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0798, + "grad_norm": 7.309990882873535, + "kl": 2.549136446788907, + "learning_rate": 9.878543239267014e-07, + "loss": 0.2549, + "num_tokens": 3449560.0, + "reward": 0.73712158203125, + "reward_std": 0.01825561933219433, + "rewards//mean": 0.73712158203125, + "rewards//std": 0.03929543495178223, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.08, + "grad_norm": 3.9757120609283447, + "kl": 1.892622048035264, + "learning_rate": 9.877847069245133e-07, + "loss": 0.1893, + "num_tokens": 3458264.0, + "reward": 0.76141357421875, + "reward_std": 0.020533859729766846, + "rewards//mean": 0.76141357421875, + "rewards//std": 0.03574249893426895, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0802, + "grad_norm": 8.19530200958252, + "kl": 2.073295334354043, + "learning_rate": 9.877148934427035e-07, + "loss": 0.2073, + "num_tokens": 3466896.0, + "reward": 0.76507568359375, + "reward_std": 0.015231217257678509, + "rewards//mean": 0.76507568359375, + "rewards//std": 0.03705707564949989, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0804, + "grad_norm": 8.765624046325684, + "kl": 1.6512914411723614, + "learning_rate": 9.876448835093929e-07, + "loss": 0.1651, + "num_tokens": 3475584.0, + "reward": 0.758544921875, + "reward_std": 0.012856746092438698, + "rewards//mean": 0.758544921875, + "rewards//std": 0.03643092140555382, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0806, + "grad_norm": 6.105319976806641, + "kl": 1.8445893814787269, + "learning_rate": 9.875746771527815e-07, + "loss": 0.1845, + "num_tokens": 3484184.0, + "reward": 0.7689208984375, + "reward_std": 0.01619412750005722, + "rewards//mean": 0.7689208984375, + "rewards//std": 0.0371377170085907, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0808, + "grad_norm": 4.7534074783325195, + "kl": 1.8480807561427355, + "learning_rate": 9.875042744011486e-07, + "loss": 0.1848, + "num_tokens": 3492760.0, + "reward": 0.76025390625, + "reward_std": 0.015435409732162952, + "rewards//mean": 0.76025390625, + "rewards//std": 0.02669837884604931, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.081, + "grad_norm": 8.251901626586914, + "kl": 2.2260224148631096, + "learning_rate": 9.874336752828522e-07, + "loss": 0.2226, + "num_tokens": 3501344.0, + "reward": 0.74627685546875, + "reward_std": 0.020439930260181427, + "rewards//mean": 0.74627685546875, + "rewards//std": 0.03999646008014679, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0812, + "grad_norm": 9.156527519226074, + "kl": 2.4917550943791866, + "learning_rate": 9.873628798263295e-07, + "loss": 0.2492, + "num_tokens": 3510008.0, + "reward": 0.74578857421875, + "reward_std": 0.02122437208890915, + "rewards//mean": 0.74578857421875, + "rewards//std": 0.04651349037885666, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0814, + "grad_norm": 5.503023147583008, + "kl": 1.6605557333678007, + "learning_rate": 9.872918880600973e-07, + "loss": 0.1661, + "num_tokens": 3518752.0, + "reward": 0.7576904296875, + "reward_std": 0.013971546664834023, + "rewards//mean": 0.7576904296875, + "rewards//std": 0.02513265423476696, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0816, + "grad_norm": 7.537412166595459, + "kl": 1.1888005854561925, + "learning_rate": 9.87220700012751e-07, + "loss": 0.1189, + "num_tokens": 3527400.0, + "reward": 0.75775146484375, + "reward_std": 0.015135394409298897, + "rewards//mean": 0.75775146484375, + "rewards//std": 0.04103698208928108, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0818, + "grad_norm": 6.6156439781188965, + "kl": 2.0969483722001314, + "learning_rate": 9.871493157129647e-07, + "loss": 0.2097, + "num_tokens": 3536064.0, + "reward": 0.73748779296875, + "reward_std": 0.017476029694080353, + "rewards//mean": 0.73748779296875, + "rewards//std": 0.03361133486032486, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.082, + "grad_norm": 7.873403072357178, + "kl": 2.726026590913534, + "learning_rate": 9.870777351894926e-07, + "loss": 0.2726, + "num_tokens": 3544736.0, + "reward": 0.70843505859375, + "reward_std": 0.01589493826031685, + "rewards//mean": 0.70843505859375, + "rewards//std": 0.03425506129860878, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0822, + "grad_norm": 12.662691116333008, + "kl": 2.877569567412138, + "learning_rate": 9.870059584711668e-07, + "loss": 0.2878, + "num_tokens": 3553424.0, + "reward": 0.72467041015625, + "reward_std": 0.01846178248524666, + "rewards//mean": 0.72467041015625, + "rewards//std": 0.041550204157829285, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0824, + "grad_norm": 8.252856254577637, + "kl": 2.546204186975956, + "learning_rate": 9.869339855868991e-07, + "loss": 0.2546, + "num_tokens": 3562040.0, + "reward": 0.71368408203125, + "reward_std": 0.017232369631528854, + "rewards//mean": 0.71368408203125, + "rewards//std": 0.039239924401044846, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0826, + "grad_norm": 8.939577102661133, + "kl": 2.673120856285095, + "learning_rate": 9.868618165656804e-07, + "loss": 0.2673, + "num_tokens": 3570696.0, + "reward": 0.73443603515625, + "reward_std": 0.01580619066953659, + "rewards//mean": 0.73443603515625, + "rewards//std": 0.03585202246904373, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0828, + "grad_norm": 8.235445976257324, + "kl": 1.9868406672030687, + "learning_rate": 9.8678945143658e-07, + "loss": 0.1987, + "num_tokens": 3579256.0, + "reward": 0.721435546875, + "reward_std": 0.012468339875340462, + "rewards//mean": 0.721435546875, + "rewards//std": 0.03478484973311424, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.083, + "grad_norm": 9.347323417663574, + "kl": 2.8131296895444393, + "learning_rate": 9.86716890228747e-07, + "loss": 0.2813, + "num_tokens": 3587928.0, + "reward": 0.707275390625, + "reward_std": 0.016024313867092133, + "rewards//mean": 0.707275390625, + "rewards//std": 0.04698273912072182, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0832, + "grad_norm": 4.233506202697754, + "kl": 1.4603004241362214, + "learning_rate": 9.866441329714087e-07, + "loss": 0.146, + "num_tokens": 3596568.0, + "reward": 0.74725341796875, + "reward_std": 0.015837572515010834, + "rewards//mean": 0.74725341796875, + "rewards//std": 0.026224061846733093, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0834, + "grad_norm": 4.584789752960205, + "kl": 2.1177571155130863, + "learning_rate": 9.86571179693872e-07, + "loss": 0.2118, + "num_tokens": 3605248.0, + "reward": 0.75885009765625, + "reward_std": 0.020106647163629532, + "rewards//mean": 0.75885009765625, + "rewards//std": 0.035422153770923615, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0836, + "grad_norm": 12.561141967773438, + "kl": 2.6870868876576424, + "learning_rate": 9.86498030425522e-07, + "loss": 0.2687, + "num_tokens": 3613840.0, + "reward": 0.73394775390625, + "reward_std": 0.022721827030181885, + "rewards//mean": 0.73394775390625, + "rewards//std": 0.04849086329340935, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0838, + "grad_norm": 6.3230881690979, + "kl": 2.121968600898981, + "learning_rate": 9.864246851958237e-07, + "loss": 0.2122, + "num_tokens": 3622464.0, + "reward": 0.744384765625, + "reward_std": 0.01947307400405407, + "rewards//mean": 0.744384765625, + "rewards//std": 0.04458090662956238, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.084, + "grad_norm": 5.679402828216553, + "kl": 1.2928848881274462, + "learning_rate": 9.863511440343205e-07, + "loss": 0.1293, + "num_tokens": 3631032.0, + "reward": 0.73162841796875, + "reward_std": 0.016452208161354065, + "rewards//mean": 0.73162841796875, + "rewards//std": 0.037684522569179535, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0842, + "grad_norm": 7.331523418426514, + "kl": 2.0377153968438506, + "learning_rate": 9.862774069706345e-07, + "loss": 0.2038, + "num_tokens": 3639672.0, + "reward": 0.72991943359375, + "reward_std": 0.016138438135385513, + "rewards//mean": 0.72991943359375, + "rewards//std": 0.04227185249328613, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0844, + "grad_norm": 5.33541202545166, + "kl": 1.6475609578192234, + "learning_rate": 9.862034740344671e-07, + "loss": 0.1648, + "num_tokens": 3648336.0, + "reward": 0.758056640625, + "reward_std": 0.020520765334367752, + "rewards//mean": 0.758056640625, + "rewards//std": 0.037414874881505966, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0846, + "grad_norm": 4.5698628425598145, + "kl": 1.597356203943491, + "learning_rate": 9.861293452555986e-07, + "loss": 0.1597, + "num_tokens": 3656896.0, + "reward": 0.77569580078125, + "reward_std": 0.02331678941845894, + "rewards//mean": 0.77569580078125, + "rewards//std": 0.04867874085903168, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0848, + "grad_norm": 7.015743732452393, + "kl": 1.2671599444001913, + "learning_rate": 9.86055020663888e-07, + "loss": 0.1267, + "num_tokens": 3665504.0, + "reward": 0.72161865234375, + "reward_std": 0.01877153106033802, + "rewards//mean": 0.72161865234375, + "rewards//std": 0.027535663917660713, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.085, + "grad_norm": 5.365900993347168, + "kl": 1.874972129240632, + "learning_rate": 9.859805002892731e-07, + "loss": 0.1875, + "num_tokens": 3674080.0, + "reward": 0.71783447265625, + "reward_std": 0.015492199920117855, + "rewards//mean": 0.71783447265625, + "rewards//std": 0.04610784351825714, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0852, + "grad_norm": 4.0368499755859375, + "kl": 1.5739709567278624, + "learning_rate": 9.859057841617708e-07, + "loss": 0.1574, + "num_tokens": 3682640.0, + "reward": 0.735595703125, + "reward_std": 0.01890283264219761, + "rewards//mean": 0.735595703125, + "rewards//std": 0.04131568968296051, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0854, + "grad_norm": 3.79864764213562, + "kl": 1.468534404411912, + "learning_rate": 9.858308723114768e-07, + "loss": 0.1469, + "num_tokens": 3691304.0, + "reward": 0.7694091796875, + "reward_std": 0.01469217799603939, + "rewards//mean": 0.7694091796875, + "rewards//std": 0.032962214201688766, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0856, + "grad_norm": 4.733910083770752, + "kl": 1.266563430428505, + "learning_rate": 9.857557647685655e-07, + "loss": 0.1267, + "num_tokens": 3700024.0, + "reward": 0.74017333984375, + "reward_std": 0.013550288043916225, + "rewards//mean": 0.74017333984375, + "rewards//std": 0.04085064306855202, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0858, + "grad_norm": 5.201381683349609, + "kl": 1.3382756058126688, + "learning_rate": 9.856804615632901e-07, + "loss": 0.1338, + "num_tokens": 3708784.0, + "reward": 0.73822021484375, + "reward_std": 0.015281138941645622, + "rewards//mean": 0.73822021484375, + "rewards//std": 0.04256092756986618, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.086, + "grad_norm": 4.525814056396484, + "kl": 1.0764458682388067, + "learning_rate": 9.856049627259832e-07, + "loss": 0.1076, + "num_tokens": 3717392.0, + "reward": 0.76513671875, + "reward_std": 0.0113562922924757, + "rewards//mean": 0.76513671875, + "rewards//std": 0.030325647443532944, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0862, + "grad_norm": 3.9547278881073, + "kl": 1.5455809328705072, + "learning_rate": 9.85529268287055e-07, + "loss": 0.1546, + "num_tokens": 3726008.0, + "reward": 0.7393798828125, + "reward_std": 0.016390426084399223, + "rewards//mean": 0.7393798828125, + "rewards//std": 0.033620622009038925, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0864, + "grad_norm": 6.644905090332031, + "kl": 1.1177070429548621, + "learning_rate": 9.854533782769959e-07, + "loss": 0.1118, + "num_tokens": 3734584.0, + "reward": 0.73309326171875, + "reward_std": 0.019686318933963776, + "rewards//mean": 0.73309326171875, + "rewards//std": 0.03657808154821396, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0866, + "grad_norm": 7.502194881439209, + "kl": 1.0910164881497622, + "learning_rate": 9.853772927263739e-07, + "loss": 0.1091, + "num_tokens": 3743296.0, + "reward": 0.77508544921875, + "reward_std": 0.02143547311425209, + "rewards//mean": 0.77508544921875, + "rewards//std": 0.042630936950445175, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0868, + "grad_norm": 5.1703057289123535, + "kl": 1.7078630905598402, + "learning_rate": 9.853010116658366e-07, + "loss": 0.1708, + "num_tokens": 3752000.0, + "reward": 0.73388671875, + "reward_std": 0.01481359638273716, + "rewards//mean": 0.73388671875, + "rewards//std": 0.03354873135685921, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.087, + "grad_norm": 5.223759651184082, + "kl": 1.200061284005642, + "learning_rate": 9.852245351261097e-07, + "loss": 0.12, + "num_tokens": 3760584.0, + "reward": 0.71185302734375, + "reward_std": 0.013629972003400326, + "rewards//mean": 0.71185302734375, + "rewards//std": 0.047321073710918427, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0872, + "grad_norm": 6.04955530166626, + "kl": 1.1332805044949055, + "learning_rate": 9.851478631379982e-07, + "loss": 0.1133, + "num_tokens": 3769168.0, + "reward": 0.735595703125, + "reward_std": 0.013865230605006218, + "rewards//mean": 0.735595703125, + "rewards//std": 0.037858907133340836, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0874, + "grad_norm": 4.328035831451416, + "kl": 1.4628506265580654, + "learning_rate": 9.850709957323854e-07, + "loss": 0.1463, + "num_tokens": 3777808.0, + "reward": 0.73638916015625, + "reward_std": 0.016544152051210403, + "rewards//mean": 0.73638916015625, + "rewards//std": 0.03291091322898865, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0876, + "grad_norm": 6.915759563446045, + "kl": 1.7527340091764927, + "learning_rate": 9.849939329402336e-07, + "loss": 0.1753, + "num_tokens": 3786392.0, + "reward": 0.7340087890625, + "reward_std": 0.024360811337828636, + "rewards//mean": 0.7340087890625, + "rewards//std": 0.03484637290239334, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0878, + "grad_norm": 5.501615524291992, + "kl": 1.0954801924526691, + "learning_rate": 9.849166747925834e-07, + "loss": 0.1095, + "num_tokens": 3795064.0, + "reward": 0.76043701171875, + "reward_std": 0.018962692469358444, + "rewards//mean": 0.76043701171875, + "rewards//std": 0.03665084391832352, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.088, + "grad_norm": 5.874672889709473, + "kl": 1.0125903934240341, + "learning_rate": 9.848392213205547e-07, + "loss": 0.1013, + "num_tokens": 3803632.0, + "reward": 0.74578857421875, + "reward_std": 0.01319526880979538, + "rewards//mean": 0.74578857421875, + "rewards//std": 0.044539760798215866, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0882, + "grad_norm": 6.564806938171387, + "kl": 2.0192187782377005, + "learning_rate": 9.847615725553455e-07, + "loss": 0.2019, + "num_tokens": 3812296.0, + "reward": 0.7786865234375, + "reward_std": 0.01627446338534355, + "rewards//mean": 0.7786865234375, + "rewards//std": 0.030171260237693787, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0884, + "grad_norm": 4.285877704620361, + "kl": 1.43904594425112, + "learning_rate": 9.84683728528233e-07, + "loss": 0.1439, + "num_tokens": 3820952.0, + "reward": 0.74932861328125, + "reward_std": 0.013798095285892487, + "rewards//mean": 0.74932861328125, + "rewards//std": 0.03864205256104469, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0886, + "grad_norm": 5.550078392028809, + "kl": 2.14881101436913, + "learning_rate": 9.846056892705727e-07, + "loss": 0.2149, + "num_tokens": 3829712.0, + "reward": 0.701416015625, + "reward_std": 0.019971946254372597, + "rewards//mean": 0.701416015625, + "rewards//std": 0.045945413410663605, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0888, + "grad_norm": 5.137343406677246, + "kl": 1.162135485559702, + "learning_rate": 9.845274548137985e-07, + "loss": 0.1162, + "num_tokens": 3838296.0, + "reward": 0.72705078125, + "reward_std": 0.012567806988954544, + "rewards//mean": 0.72705078125, + "rewards//std": 0.01751773990690708, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.089, + "grad_norm": 5.510349750518799, + "kl": 1.5951459687203169, + "learning_rate": 9.844490251894236e-07, + "loss": 0.1595, + "num_tokens": 3846960.0, + "reward": 0.7442626953125, + "reward_std": 0.01350520271807909, + "rewards//mean": 0.7442626953125, + "rewards//std": 0.03690710663795471, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0892, + "grad_norm": 4.13537073135376, + "kl": 1.2297241613268852, + "learning_rate": 9.843704004290392e-07, + "loss": 0.123, + "num_tokens": 3855688.0, + "reward": 0.74462890625, + "reward_std": 0.009682174772024155, + "rewards//mean": 0.74462890625, + "rewards//std": 0.035038936883211136, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0894, + "grad_norm": 4.711252212524414, + "kl": 1.6171795912086964, + "learning_rate": 9.842915805643156e-07, + "loss": 0.1617, + "num_tokens": 3864312.0, + "reward": 0.75726318359375, + "reward_std": 0.016375333070755005, + "rewards//mean": 0.75726318359375, + "rewards//std": 0.03374572843313217, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0896, + "grad_norm": 6.262650489807129, + "kl": 1.0032986970618367, + "learning_rate": 9.84212565627001e-07, + "loss": 0.1003, + "num_tokens": 3872992.0, + "reward": 0.7371826171875, + "reward_std": 0.01670708693563938, + "rewards//mean": 0.7371826171875, + "rewards//std": 0.030293432995676994, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0898, + "grad_norm": 5.053429126739502, + "kl": 1.1675552818924189, + "learning_rate": 9.841333556489232e-07, + "loss": 0.1168, + "num_tokens": 3881712.0, + "reward": 0.76287841796875, + "reward_std": 0.012518439441919327, + "rewards//mean": 0.76287841796875, + "rewards//std": 0.03140215948224068, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.09, + "grad_norm": 5.673282146453857, + "kl": 1.2919608671218157, + "learning_rate": 9.840539506619872e-07, + "loss": 0.1292, + "num_tokens": 3890312.0, + "reward": 0.7432861328125, + "reward_std": 0.012161493301391602, + "rewards//mean": 0.7432861328125, + "rewards//std": 0.03512846678495407, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0902, + "grad_norm": 3.9209952354431152, + "kl": 1.5129262786358595, + "learning_rate": 9.83974350698178e-07, + "loss": 0.1513, + "num_tokens": 3899016.0, + "reward": 0.7255859375, + "reward_std": 0.01528351753950119, + "rewards//mean": 0.7255859375, + "rewards//std": 0.0511353462934494, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0904, + "grad_norm": 5.624383926391602, + "kl": 1.4455570131540298, + "learning_rate": 9.838945557895584e-07, + "loss": 0.1446, + "num_tokens": 3907752.0, + "reward": 0.76983642578125, + "reward_std": 0.015880919992923737, + "rewards//mean": 0.76983642578125, + "rewards//std": 0.027342019602656364, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0906, + "grad_norm": 9.07286262512207, + "kl": 1.7261950299143791, + "learning_rate": 9.838145659682692e-07, + "loss": 0.1726, + "num_tokens": 3916496.0, + "reward": 0.77105712890625, + "reward_std": 0.009812546893954277, + "rewards//mean": 0.77105712890625, + "rewards//std": 0.03369365260004997, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0908, + "grad_norm": 7.850827693939209, + "kl": 2.364481531083584, + "learning_rate": 9.83734381266531e-07, + "loss": 0.2364, + "num_tokens": 3925008.0, + "reward": 0.72412109375, + "reward_std": 0.01726219430565834, + "rewards//mean": 0.72412109375, + "rewards//std": 0.041582297533750534, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.091, + "grad_norm": 5.526845932006836, + "kl": 1.9436415508389473, + "learning_rate": 9.836540017166419e-07, + "loss": 0.1944, + "num_tokens": 3933696.0, + "reward": 0.7244873046875, + "reward_std": 0.014361506327986717, + "rewards//mean": 0.7244873046875, + "rewards//std": 0.035631630569696426, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0912, + "grad_norm": 5.372095584869385, + "kl": 1.3827466797083616, + "learning_rate": 9.835734273509785e-07, + "loss": 0.1383, + "num_tokens": 3942304.0, + "reward": 0.77215576171875, + "reward_std": 0.01689828932285309, + "rewards//mean": 0.77215576171875, + "rewards//std": 0.029504306614398956, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0914, + "grad_norm": 6.444041728973389, + "kl": 1.6388468109071255, + "learning_rate": 9.834926582019966e-07, + "loss": 0.1639, + "num_tokens": 3950968.0, + "reward": 0.74346923828125, + "reward_std": 0.015370641835033894, + "rewards//mean": 0.74346923828125, + "rewards//std": 0.03465355932712555, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0916, + "grad_norm": 5.43826150894165, + "kl": 1.4846863243728876, + "learning_rate": 9.834116943022297e-07, + "loss": 0.1485, + "num_tokens": 3959640.0, + "reward": 0.7340087890625, + "reward_std": 0.015341941267251968, + "rewards//mean": 0.7340087890625, + "rewards//std": 0.03952307254076004, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0918, + "grad_norm": 6.569565773010254, + "kl": 2.233142463490367, + "learning_rate": 9.8333053568429e-07, + "loss": 0.2233, + "num_tokens": 3968256.0, + "reward": 0.7635498046875, + "reward_std": 0.01592983305454254, + "rewards//mean": 0.7635498046875, + "rewards//std": 0.030249428004026413, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.092, + "grad_norm": 11.522577285766602, + "kl": 2.4209519093856215, + "learning_rate": 9.832491823808686e-07, + "loss": 0.2421, + "num_tokens": 3977024.0, + "reward": 0.75799560546875, + "reward_std": 0.015631375834345818, + "rewards//mean": 0.75799560546875, + "rewards//std": 0.0318368598818779, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0922, + "grad_norm": 5.3075480461120605, + "kl": 2.1683051753789186, + "learning_rate": 9.831676344247342e-07, + "loss": 0.2168, + "num_tokens": 3985664.0, + "reward": 0.71514892578125, + "reward_std": 0.01411198079586029, + "rewards//mean": 0.71514892578125, + "rewards//std": 0.03985010087490082, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0924, + "grad_norm": 4.104161262512207, + "kl": 1.1081998217850924, + "learning_rate": 9.830858918487346e-07, + "loss": 0.1108, + "num_tokens": 3994480.0, + "reward": 0.75677490234375, + "reward_std": 0.01155187003314495, + "rewards//mean": 0.75677490234375, + "rewards//std": 0.03030760958790779, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0926, + "grad_norm": 6.832763195037842, + "kl": 2.357150062918663, + "learning_rate": 9.830039546857952e-07, + "loss": 0.2357, + "num_tokens": 4003112.0, + "reward": 0.72491455078125, + "reward_std": 0.020855844020843506, + "rewards//mean": 0.72491455078125, + "rewards//std": 0.051110733300447464, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0928, + "grad_norm": 5.7236175537109375, + "kl": 1.966838133521378, + "learning_rate": 9.829218229689209e-07, + "loss": 0.1967, + "num_tokens": 4011720.0, + "reward": 0.7279052734375, + "reward_std": 0.022397905588150024, + "rewards//mean": 0.7279052734375, + "rewards//std": 0.047888245433568954, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.093, + "grad_norm": 5.766880035400391, + "kl": 1.824003990739584, + "learning_rate": 9.828394967311938e-07, + "loss": 0.1824, + "num_tokens": 4020384.0, + "reward": 0.7261962890625, + "reward_std": 0.016093678772449493, + "rewards//mean": 0.7261962890625, + "rewards//std": 0.045839034020900726, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0932, + "grad_norm": 4.534486293792725, + "kl": 1.870192615315318, + "learning_rate": 9.827569760057754e-07, + "loss": 0.187, + "num_tokens": 4029016.0, + "reward": 0.75372314453125, + "reward_std": 0.016078153625130653, + "rewards//mean": 0.75372314453125, + "rewards//std": 0.02742273360490799, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0934, + "grad_norm": 7.217731952667236, + "kl": 1.1967949345707893, + "learning_rate": 9.826742608259047e-07, + "loss": 0.1197, + "num_tokens": 4037688.0, + "reward": 0.7689208984375, + "reward_std": 0.015779195353388786, + "rewards//mean": 0.7689208984375, + "rewards//std": 0.03237839788198471, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0936, + "grad_norm": 5.504461288452148, + "kl": 2.1448816806077957, + "learning_rate": 9.825913512248995e-07, + "loss": 0.2145, + "num_tokens": 4046304.0, + "reward": 0.734375, + "reward_std": 0.01801781728863716, + "rewards//mean": 0.734375, + "rewards//std": 0.029626740142703056, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0938, + "grad_norm": 4.837132453918457, + "kl": 2.0805433094501495, + "learning_rate": 9.825082472361556e-07, + "loss": 0.2081, + "num_tokens": 4054992.0, + "reward": 0.70660400390625, + "reward_std": 0.01611001417040825, + "rewards//mean": 0.70660400390625, + "rewards//std": 0.041034769266843796, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.094, + "grad_norm": 5.750391960144043, + "kl": 0.7883851593360305, + "learning_rate": 9.824249488931475e-07, + "loss": 0.0788, + "num_tokens": 4063624.0, + "reward": 0.74493408203125, + "reward_std": 0.009561952203512192, + "rewards//mean": 0.74493408203125, + "rewards//std": 0.027793388813734055, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0942, + "grad_norm": 4.606777191162109, + "kl": 0.9498005639761686, + "learning_rate": 9.82341456229428e-07, + "loss": 0.095, + "num_tokens": 4072256.0, + "reward": 0.77325439453125, + "reward_std": 0.0117096658796072, + "rewards//mean": 0.77325439453125, + "rewards//std": 0.031346190720796585, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0944, + "grad_norm": 5.636474132537842, + "kl": 1.744721632450819, + "learning_rate": 9.822577692786272e-07, + "loss": 0.1745, + "num_tokens": 4080920.0, + "reward": 0.74176025390625, + "reward_std": 0.016624407842755318, + "rewards//mean": 0.74176025390625, + "rewards//std": 0.030600877478718758, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0946, + "grad_norm": 6.466523170471191, + "kl": 2.1718217339366674, + "learning_rate": 9.821738880744547e-07, + "loss": 0.2172, + "num_tokens": 4089496.0, + "reward": 0.7296142578125, + "reward_std": 0.019219931215047836, + "rewards//mean": 0.7296142578125, + "rewards//std": 0.04213757440447807, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0948, + "grad_norm": 4.043447971343994, + "kl": 1.4767930824309587, + "learning_rate": 9.820898126506979e-07, + "loss": 0.1477, + "num_tokens": 4098072.0, + "reward": 0.7376708984375, + "reward_std": 0.02094002068042755, + "rewards//mean": 0.7376708984375, + "rewards//std": 0.03580961748957634, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.095, + "grad_norm": 7.309907913208008, + "kl": 1.1765703689306974, + "learning_rate": 9.820055430412219e-07, + "loss": 0.1177, + "num_tokens": 4106672.0, + "reward": 0.705078125, + "reward_std": 0.014730259776115417, + "rewards//mean": 0.705078125, + "rewards//std": 0.05473177134990692, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0952, + "grad_norm": 6.035542011260986, + "kl": 1.0523865576833487, + "learning_rate": 9.81921079279971e-07, + "loss": 0.1052, + "num_tokens": 4115384.0, + "reward": 0.74176025390625, + "reward_std": 0.0114174485206604, + "rewards//mean": 0.74176025390625, + "rewards//std": 0.026172636076807976, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0954, + "grad_norm": 5.155995845794678, + "kl": 1.7892052046954632, + "learning_rate": 9.81836421400967e-07, + "loss": 0.1789, + "num_tokens": 4124064.0, + "reward": 0.73919677734375, + "reward_std": 0.014689266681671143, + "rewards//mean": 0.73919677734375, + "rewards//std": 0.039943818002939224, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0956, + "grad_norm": 5.833273410797119, + "kl": 1.76816301420331, + "learning_rate": 9.817515694383102e-07, + "loss": 0.1768, + "num_tokens": 4132704.0, + "reward": 0.738037109375, + "reward_std": 0.026206418871879578, + "rewards//mean": 0.738037109375, + "rewards//std": 0.05074487254023552, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0958, + "grad_norm": 3.475642681121826, + "kl": 1.7420800495892763, + "learning_rate": 9.816665234261786e-07, + "loss": 0.1742, + "num_tokens": 4141304.0, + "reward": 0.7596435546875, + "reward_std": 0.015064358711242676, + "rewards//mean": 0.7596435546875, + "rewards//std": 0.03991486132144928, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.096, + "grad_norm": 4.941133499145508, + "kl": 1.792766097933054, + "learning_rate": 9.81581283398829e-07, + "loss": 0.1793, + "num_tokens": 4149840.0, + "reward": 0.764404296875, + "reward_std": 0.020246122032403946, + "rewards//mean": 0.764404296875, + "rewards//std": 0.04281841218471527, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0962, + "grad_norm": 6.210538387298584, + "kl": 1.294305069372058, + "learning_rate": 9.814958493905962e-07, + "loss": 0.1294, + "num_tokens": 4158448.0, + "reward": 0.71954345703125, + "reward_std": 0.01249985583126545, + "rewards//mean": 0.71954345703125, + "rewards//std": 0.03835659101605415, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0964, + "grad_norm": 8.487665176391602, + "kl": 1.755100229755044, + "learning_rate": 9.814102214358926e-07, + "loss": 0.1755, + "num_tokens": 4167072.0, + "reward": 0.73297119140625, + "reward_std": 0.011209162883460522, + "rewards//mean": 0.73297119140625, + "rewards//std": 0.04069247841835022, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0966, + "grad_norm": 5.701470375061035, + "kl": 1.4351928923279047, + "learning_rate": 9.813243995692097e-07, + "loss": 0.1435, + "num_tokens": 4175656.0, + "reward": 0.76092529296875, + "reward_std": 0.01819439046084881, + "rewards//mean": 0.76092529296875, + "rewards//std": 0.035633064806461334, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0968, + "grad_norm": 6.038917541503906, + "kl": 1.1271718349307775, + "learning_rate": 9.81238383825116e-07, + "loss": 0.1127, + "num_tokens": 4184232.0, + "reward": 0.7322998046875, + "reward_std": 0.013266301713883877, + "rewards//mean": 0.7322998046875, + "rewards//std": 0.02969997003674507, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.097, + "grad_norm": 8.786454200744629, + "kl": 1.132724966853857, + "learning_rate": 9.81152174238259e-07, + "loss": 0.1133, + "num_tokens": 4192904.0, + "reward": 0.7191162109375, + "reward_std": 0.010953761637210846, + "rewards//mean": 0.7191162109375, + "rewards//std": 0.03941875696182251, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0972, + "grad_norm": 6.407580852508545, + "kl": 2.186642337590456, + "learning_rate": 9.810657708433635e-07, + "loss": 0.2187, + "num_tokens": 4201520.0, + "reward": 0.7320556640625, + "reward_std": 0.01584211178123951, + "rewards//mean": 0.7320556640625, + "rewards//std": 0.032352205365896225, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0974, + "grad_norm": 5.325110912322998, + "kl": 1.8596960436552763, + "learning_rate": 9.809791736752332e-07, + "loss": 0.186, + "num_tokens": 4210128.0, + "reward": 0.75628662109375, + "reward_std": 0.012583386152982712, + "rewards//mean": 0.75628662109375, + "rewards//std": 0.03423561155796051, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0976, + "grad_norm": 3.9019405841827393, + "kl": 1.585777211934328, + "learning_rate": 9.808923827687492e-07, + "loss": 0.1586, + "num_tokens": 4218696.0, + "reward": 0.7193603515625, + "reward_std": 0.013766135089099407, + "rewards//mean": 0.7193603515625, + "rewards//std": 0.036146216094493866, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0978, + "grad_norm": 6.00661563873291, + "kl": 1.2772408034652472, + "learning_rate": 9.80805398158871e-07, + "loss": 0.1277, + "num_tokens": 4227304.0, + "reward": 0.76727294921875, + "reward_std": 0.02176658809185028, + "rewards//mean": 0.76727294921875, + "rewards//std": 0.04012569040060043, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.098, + "grad_norm": 5.746229648590088, + "kl": 2.446054134517908, + "learning_rate": 9.80718219880636e-07, + "loss": 0.2446, + "num_tokens": 4235968.0, + "reward": 0.7274169921875, + "reward_std": 0.0224794652312994, + "rewards//mean": 0.7274169921875, + "rewards//std": 0.04227529838681221, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0982, + "grad_norm": 4.4794697761535645, + "kl": 1.500032465904951, + "learning_rate": 9.806308479691594e-07, + "loss": 0.15, + "num_tokens": 4244616.0, + "reward": 0.73822021484375, + "reward_std": 0.014269332401454449, + "rewards//mean": 0.73822021484375, + "rewards//std": 0.03540334478020668, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0984, + "grad_norm": 4.810574531555176, + "kl": 1.8452752772718668, + "learning_rate": 9.805432824596347e-07, + "loss": 0.1845, + "num_tokens": 4253168.0, + "reward": 0.71533203125, + "reward_std": 0.015700260177254677, + "rewards//mean": 0.71533203125, + "rewards//std": 0.029907453805208206, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0986, + "grad_norm": 4.90366268157959, + "kl": 1.7375615183264017, + "learning_rate": 9.804555233873332e-07, + "loss": 0.1738, + "num_tokens": 4261768.0, + "reward": 0.7515869140625, + "reward_std": 0.013618829660117626, + "rewards//mean": 0.7515869140625, + "rewards//std": 0.032613176852464676, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0988, + "grad_norm": 6.374400615692139, + "kl": 1.3192815203219652, + "learning_rate": 9.803675707876048e-07, + "loss": 0.1319, + "num_tokens": 4270328.0, + "reward": 0.72662353515625, + "reward_std": 0.011379792355000973, + "rewards//mean": 0.72662353515625, + "rewards//std": 0.038935136049985886, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.099, + "grad_norm": 6.761526584625244, + "kl": 1.7310762237757444, + "learning_rate": 9.80279424695876e-07, + "loss": 0.1731, + "num_tokens": 4279016.0, + "reward": 0.7420654296875, + "reward_std": 0.019092299044132233, + "rewards//mean": 0.7420654296875, + "rewards//std": 0.03185616806149483, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0992, + "grad_norm": 4.999700546264648, + "kl": 1.366264495998621, + "learning_rate": 9.801910851476524e-07, + "loss": 0.1366, + "num_tokens": 4287632.0, + "reward": 0.7283935546875, + "reward_std": 0.012258632108569145, + "rewards//mean": 0.7283935546875, + "rewards//std": 0.04211745038628578, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0994, + "grad_norm": 10.183734893798828, + "kl": 1.0184291638433933, + "learning_rate": 9.80102552178517e-07, + "loss": 0.1018, + "num_tokens": 4296256.0, + "reward": 0.72607421875, + "reward_std": 0.010940630920231342, + "rewards//mean": 0.72607421875, + "rewards//std": 0.029712455347180367, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0996, + "grad_norm": 4.5478925704956055, + "kl": 1.484832838177681, + "learning_rate": 9.800138258241309e-07, + "loss": 0.1485, + "num_tokens": 4304944.0, + "reward": 0.75970458984375, + "reward_std": 0.01730983518064022, + "rewards//mean": 0.75970458984375, + "rewards//std": 0.04647018760442734, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.0998, + "grad_norm": 7.538711071014404, + "kl": 1.5070508643984795, + "learning_rate": 9.799249061202334e-07, + "loss": 0.1507, + "num_tokens": 4313560.0, + "reward": 0.7239990234375, + "reward_std": 0.011308427900075912, + "rewards//mean": 0.7239990234375, + "rewards//std": 0.038109779357910156, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1, + "grad_norm": 8.800752639770508, + "kl": 1.108166430145502, + "learning_rate": 9.798357931026412e-07, + "loss": 0.1108, + "num_tokens": 4322208.0, + "reward": 0.75616455078125, + "reward_std": 0.01142922230064869, + "rewards//mean": 0.75616455078125, + "rewards//std": 0.027733413502573967, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1002, + "grad_norm": 6.367660999298096, + "kl": 2.1931713595986366, + "learning_rate": 9.797464868072486e-07, + "loss": 0.2193, + "num_tokens": 4330888.0, + "reward": 0.740966796875, + "reward_std": 0.017630886286497116, + "rewards//mean": 0.740966796875, + "rewards//std": 0.03819647058844566, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1004, + "grad_norm": 8.521171569824219, + "kl": 1.3009831812232733, + "learning_rate": 9.796569872700287e-07, + "loss": 0.1301, + "num_tokens": 4339480.0, + "reward": 0.72418212890625, + "reward_std": 0.015889937058091164, + "rewards//mean": 0.72418212890625, + "rewards//std": 0.04660874977707863, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1006, + "grad_norm": 6.808535575866699, + "kl": 2.014758253470063, + "learning_rate": 9.795672945270316e-07, + "loss": 0.2015, + "num_tokens": 4348208.0, + "reward": 0.70208740234375, + "reward_std": 0.014105882495641708, + "rewards//mean": 0.70208740234375, + "rewards//std": 0.04658601060509682, + "step": 503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1008, + "grad_norm": 8.12624454498291, + "kl": 1.96061559766531, + "learning_rate": 9.794774086143857e-07, + "loss": 0.1961, + "num_tokens": 4356904.0, + "reward": 0.74176025390625, + "reward_std": 0.016262296587228775, + "rewards//mean": 0.74176025390625, + "rewards//std": 0.034033387899398804, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.101, + "grad_norm": 5.099590301513672, + "kl": 1.2936958279460669, + "learning_rate": 9.79387329568297e-07, + "loss": 0.1294, + "num_tokens": 4365528.0, + "reward": 0.74072265625, + "reward_std": 0.01925729401409626, + "rewards//mean": 0.74072265625, + "rewards//std": 0.0370873399078846, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1012, + "grad_norm": 5.462556838989258, + "kl": 1.5399411581456661, + "learning_rate": 9.792970574250493e-07, + "loss": 0.154, + "num_tokens": 4374120.0, + "reward": 0.71966552734375, + "reward_std": 0.01657002419233322, + "rewards//mean": 0.71966552734375, + "rewards//std": 0.03408760949969292, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1014, + "grad_norm": 5.555461406707764, + "kl": 1.4309385670349002, + "learning_rate": 9.79206592221004e-07, + "loss": 0.1431, + "num_tokens": 4382808.0, + "reward": 0.74737548828125, + "reward_std": 0.018372822552919388, + "rewards//mean": 0.74737548828125, + "rewards//std": 0.03450646996498108, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1016, + "grad_norm": 10.545476913452148, + "kl": 2.4767883997410536, + "learning_rate": 9.791159339926008e-07, + "loss": 0.2477, + "num_tokens": 4391536.0, + "reward": 0.76025390625, + "reward_std": 0.01708972081542015, + "rewards//mean": 0.76025390625, + "rewards//std": 0.03743833303451538, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1018, + "grad_norm": 5.856667995452881, + "kl": 1.4919790271669626, + "learning_rate": 9.790250827763565e-07, + "loss": 0.1492, + "num_tokens": 4400152.0, + "reward": 0.747802734375, + "reward_std": 0.010662312619388103, + "rewards//mean": 0.747802734375, + "rewards//std": 0.04489491134881973, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.102, + "grad_norm": 6.639697074890137, + "kl": 1.729028050787747, + "learning_rate": 9.789340386088662e-07, + "loss": 0.1729, + "num_tokens": 4408712.0, + "reward": 0.7100830078125, + "reward_std": 0.01790030673146248, + "rewards//mean": 0.7100830078125, + "rewards//std": 0.0462532602250576, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1022, + "grad_norm": 6.8193745613098145, + "kl": 1.2193750012665987, + "learning_rate": 9.788428015268026e-07, + "loss": 0.1219, + "num_tokens": 4417376.0, + "reward": 0.74932861328125, + "reward_std": 0.016750093549489975, + "rewards//mean": 0.74932861328125, + "rewards//std": 0.03208032250404358, + "step": 511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1024, + "grad_norm": 15.178659439086914, + "kl": 2.2706920113414526, + "learning_rate": 9.787513715669157e-07, + "loss": 0.2271, + "num_tokens": 4426096.0, + "reward": 0.7271728515625, + "reward_std": 0.015744894742965698, + "rewards//mean": 0.7271728515625, + "rewards//std": 0.04813040420413017, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1026, + "grad_norm": 7.991171360015869, + "kl": 2.079538142308593, + "learning_rate": 9.786597487660335e-07, + "loss": 0.208, + "num_tokens": 4434696.0, + "reward": 0.72113037109375, + "reward_std": 0.015818912535905838, + "rewards//mean": 0.72113037109375, + "rewards//std": 0.044497936964035034, + "step": 513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1028, + "grad_norm": 8.265140533447266, + "kl": 2.0102353263646364, + "learning_rate": 9.78567933161062e-07, + "loss": 0.201, + "num_tokens": 4443352.0, + "reward": 0.7506103515625, + "reward_std": 0.012508335523307323, + "rewards//mean": 0.7506103515625, + "rewards//std": 0.03863367810845375, + "step": 514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.103, + "grad_norm": 7.213916301727295, + "kl": 1.9426716212183237, + "learning_rate": 9.78475924788984e-07, + "loss": 0.1943, + "num_tokens": 4452008.0, + "reward": 0.74835205078125, + "reward_std": 0.02259785309433937, + "rewards//mean": 0.74835205078125, + "rewards//std": 0.04443359375, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1032, + "grad_norm": 6.833401203155518, + "kl": 1.6954651195555925, + "learning_rate": 9.783837236868609e-07, + "loss": 0.1695, + "num_tokens": 4460584.0, + "reward": 0.744384765625, + "reward_std": 0.01196884922683239, + "rewards//mean": 0.744384765625, + "rewards//std": 0.022955898195505142, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1034, + "grad_norm": 5.521451473236084, + "kl": 1.6248896569013596, + "learning_rate": 9.782913298918308e-07, + "loss": 0.1625, + "num_tokens": 4469208.0, + "reward": 0.75726318359375, + "reward_std": 0.0237879641354084, + "rewards//mean": 0.75726318359375, + "rewards//std": 0.03943195566534996, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1036, + "grad_norm": 6.402383804321289, + "kl": 1.8084558583796024, + "learning_rate": 9.781987434411106e-07, + "loss": 0.1808, + "num_tokens": 4477880.0, + "reward": 0.73272705078125, + "reward_std": 0.012692469172179699, + "rewards//mean": 0.73272705078125, + "rewards//std": 0.03106970526278019, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1038, + "grad_norm": 5.475217819213867, + "kl": 1.6366539895534515, + "learning_rate": 9.781059643719936e-07, + "loss": 0.1637, + "num_tokens": 4486520.0, + "reward": 0.745361328125, + "reward_std": 0.020745567977428436, + "rewards//mean": 0.745361328125, + "rewards//std": 0.04173563793301582, + "step": 519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.104, + "grad_norm": 6.359858989715576, + "kl": 1.1945130750536919, + "learning_rate": 9.780129927218511e-07, + "loss": 0.1195, + "num_tokens": 4495064.0, + "reward": 0.74407958984375, + "reward_std": 0.020024165511131287, + "rewards//mean": 0.74407958984375, + "rewards//std": 0.03967764973640442, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1042, + "grad_norm": 5.116205215454102, + "kl": 1.5144299790263176, + "learning_rate": 9.779198285281326e-07, + "loss": 0.1514, + "num_tokens": 4503728.0, + "reward": 0.71844482421875, + "reward_std": 0.01762961782515049, + "rewards//mean": 0.71844482421875, + "rewards//std": 0.04398159682750702, + "step": 521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1044, + "grad_norm": 4.483882904052734, + "kl": 1.4487113784998655, + "learning_rate": 9.77826471828364e-07, + "loss": 0.1449, + "num_tokens": 4512344.0, + "reward": 0.75860595703125, + "reward_std": 0.01794547028839588, + "rewards//mean": 0.75860595703125, + "rewards//std": 0.036954399198293686, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1046, + "grad_norm": 5.407111167907715, + "kl": 1.382027082145214, + "learning_rate": 9.777329226601501e-07, + "loss": 0.1382, + "num_tokens": 4520896.0, + "reward": 0.73834228515625, + "reward_std": 0.01703845150768757, + "rewards//mean": 0.73834228515625, + "rewards//std": 0.03187534958124161, + "step": 523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1048, + "grad_norm": 5.94820499420166, + "kl": 1.213211888447404, + "learning_rate": 9.776391810611718e-07, + "loss": 0.1213, + "num_tokens": 4529480.0, + "reward": 0.72613525390625, + "reward_std": 0.018442852422595024, + "rewards//mean": 0.72613525390625, + "rewards//std": 0.031436365097761154, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.105, + "grad_norm": 4.362919330596924, + "kl": 1.3717845249921083, + "learning_rate": 9.775452470691885e-07, + "loss": 0.1372, + "num_tokens": 4538064.0, + "reward": 0.74176025390625, + "reward_std": 0.01696932688355446, + "rewards//mean": 0.74176025390625, + "rewards//std": 0.03191047161817551, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1052, + "grad_norm": 6.720903396606445, + "kl": 1.71204486861825, + "learning_rate": 9.774511207220368e-07, + "loss": 0.1712, + "num_tokens": 4546688.0, + "reward": 0.76171875, + "reward_std": 0.01861170306801796, + "rewards//mean": 0.76171875, + "rewards//std": 0.04062476381659508, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1054, + "grad_norm": 5.4875311851501465, + "kl": 1.5787024535238743, + "learning_rate": 9.77356802057631e-07, + "loss": 0.1579, + "num_tokens": 4555432.0, + "reward": 0.7608642578125, + "reward_std": 0.02097097411751747, + "rewards//mean": 0.7608642578125, + "rewards//std": 0.03687756508588791, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1056, + "grad_norm": 7.605865955352783, + "kl": 1.2236286401748657, + "learning_rate": 9.77262291113962e-07, + "loss": 0.1224, + "num_tokens": 4564064.0, + "reward": 0.75970458984375, + "reward_std": 0.019591055810451508, + "rewards//mean": 0.75970458984375, + "rewards//std": 0.03574715927243233, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1058, + "grad_norm": 4.3834943771362305, + "kl": 1.65463693626225, + "learning_rate": 9.771675879290996e-07, + "loss": 0.1655, + "num_tokens": 4572752.0, + "reward": 0.74310302734375, + "reward_std": 0.010753463953733444, + "rewards//mean": 0.74310302734375, + "rewards//std": 0.03205152601003647, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.106, + "grad_norm": 8.616353988647461, + "kl": 1.006358283571899, + "learning_rate": 9.770726925411897e-07, + "loss": 0.1006, + "num_tokens": 4581432.0, + "reward": 0.7774658203125, + "reward_std": 0.017439065501093864, + "rewards//mean": 0.7774658203125, + "rewards//std": 0.031458329409360886, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1062, + "grad_norm": 8.021350860595703, + "kl": 1.4562967214733362, + "learning_rate": 9.769776049884563e-07, + "loss": 0.1456, + "num_tokens": 4590056.0, + "reward": 0.740234375, + "reward_std": 0.020591605454683304, + "rewards//mean": 0.740234375, + "rewards//std": 0.03929740935564041, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1064, + "grad_norm": 5.052854537963867, + "kl": 1.7414503898471594, + "learning_rate": 9.768823253092008e-07, + "loss": 0.1741, + "num_tokens": 4598776.0, + "reward": 0.74359130859375, + "reward_std": 0.01788424886763096, + "rewards//mean": 0.74359130859375, + "rewards//std": 0.029954928904771805, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1066, + "grad_norm": 5.531140327453613, + "kl": 1.4807850709185004, + "learning_rate": 9.767868535418014e-07, + "loss": 0.1481, + "num_tokens": 4607464.0, + "reward": 0.76080322265625, + "reward_std": 0.017598077654838562, + "rewards//mean": 0.76080322265625, + "rewards//std": 0.0396089144051075, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1068, + "grad_norm": 7.237030982971191, + "kl": 1.5943659655749798, + "learning_rate": 9.766911897247146e-07, + "loss": 0.1594, + "num_tokens": 4616104.0, + "reward": 0.7220458984375, + "reward_std": 0.010353684425354004, + "rewards//mean": 0.7220458984375, + "rewards//std": 0.03670475631952286, + "step": 534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.107, + "grad_norm": 4.178348541259766, + "kl": 1.9010947477072477, + "learning_rate": 9.765953338964734e-07, + "loss": 0.1901, + "num_tokens": 4624760.0, + "reward": 0.7391357421875, + "reward_std": 0.017037052661180496, + "rewards//mean": 0.7391357421875, + "rewards//std": 0.040835678577423096, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1072, + "grad_norm": 5.015347480773926, + "kl": 1.747239861637354, + "learning_rate": 9.76499286095689e-07, + "loss": 0.1747, + "num_tokens": 4633392.0, + "reward": 0.74774169921875, + "reward_std": 0.020298104733228683, + "rewards//mean": 0.74774169921875, + "rewards//std": 0.039234522730112076, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1074, + "grad_norm": 6.626712322235107, + "kl": 2.1457751411944628, + "learning_rate": 9.764030463610488e-07, + "loss": 0.2146, + "num_tokens": 4642072.0, + "reward": 0.748046875, + "reward_std": 0.02151212841272354, + "rewards//mean": 0.748046875, + "rewards//std": 0.040012020617723465, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1076, + "grad_norm": 14.374396324157715, + "kl": 2.592175643891096, + "learning_rate": 9.763066147313189e-07, + "loss": 0.2592, + "num_tokens": 4650720.0, + "reward": 0.75128173828125, + "reward_std": 0.015068383887410164, + "rewards//mean": 0.75128173828125, + "rewards//std": 0.03822930157184601, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1078, + "grad_norm": 4.086817264556885, + "kl": 1.9288067817687988, + "learning_rate": 9.762099912453412e-07, + "loss": 0.1929, + "num_tokens": 4659312.0, + "reward": 0.75177001953125, + "reward_std": 0.01850154623389244, + "rewards//mean": 0.75177001953125, + "rewards//std": 0.03926614671945572, + "step": 539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.108, + "grad_norm": 5.19843864440918, + "kl": 1.9183164574205875, + "learning_rate": 9.76113175942036e-07, + "loss": 0.1918, + "num_tokens": 4667896.0, + "reward": 0.7498779296875, + "reward_std": 0.014849013648927212, + "rewards//mean": 0.7498779296875, + "rewards//std": 0.03451990336179733, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1082, + "grad_norm": 6.632591247558594, + "kl": 1.886117585003376, + "learning_rate": 9.760161688604007e-07, + "loss": 0.1886, + "num_tokens": 4676488.0, + "reward": 0.7457275390625, + "reward_std": 0.019388720393180847, + "rewards//mean": 0.7457275390625, + "rewards//std": 0.032790929079055786, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1084, + "grad_norm": 8.33913803100586, + "kl": 2.0020735822618008, + "learning_rate": 9.759189700395095e-07, + "loss": 0.2002, + "num_tokens": 4685128.0, + "reward": 0.74432373046875, + "reward_std": 0.01459340751171112, + "rewards//mean": 0.74432373046875, + "rewards//std": 0.0429711751639843, + "step": 542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1086, + "grad_norm": 5.022960186004639, + "kl": 2.1866206601262093, + "learning_rate": 9.758215795185138e-07, + "loss": 0.2187, + "num_tokens": 4693824.0, + "reward": 0.75189208984375, + "reward_std": 0.015742268413305283, + "rewards//mean": 0.75189208984375, + "rewards//std": 0.0536862388253212, + "step": 543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1088, + "grad_norm": 4.4368062019348145, + "kl": 1.8616263028234243, + "learning_rate": 9.757239973366428e-07, + "loss": 0.1862, + "num_tokens": 4702472.0, + "reward": 0.74822998046875, + "reward_std": 0.015865590423345566, + "rewards//mean": 0.74822998046875, + "rewards//std": 0.04504062980413437, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.109, + "grad_norm": 4.997870922088623, + "kl": 2.5523458272218704, + "learning_rate": 9.756262235332028e-07, + "loss": 0.2552, + "num_tokens": 4711104.0, + "reward": 0.75885009765625, + "reward_std": 0.023846883326768875, + "rewards//mean": 0.75885009765625, + "rewards//std": 0.03539479151368141, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1092, + "grad_norm": 10.336973190307617, + "kl": 3.0368824899196625, + "learning_rate": 9.755282581475767e-07, + "loss": 0.3037, + "num_tokens": 4719688.0, + "reward": 0.70037841796875, + "reward_std": 0.01915045827627182, + "rewards//mean": 0.70037841796875, + "rewards//std": 0.047834914177656174, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1094, + "grad_norm": 6.205177307128906, + "kl": 1.4204385522753, + "learning_rate": 9.754301012192253e-07, + "loss": 0.142, + "num_tokens": 4728272.0, + "reward": 0.7509765625, + "reward_std": 0.019058480858802795, + "rewards//mean": 0.7509765625, + "rewards//std": 0.03560464084148407, + "step": 547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1096, + "grad_norm": 9.930331230163574, + "kl": 2.1349884532392025, + "learning_rate": 9.753317527876856e-07, + "loss": 0.2135, + "num_tokens": 4736888.0, + "reward": 0.7347412109375, + "reward_std": 0.011325545608997345, + "rewards//mean": 0.7347412109375, + "rewards//std": 0.035019706934690475, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1098, + "grad_norm": 15.23091983795166, + "kl": 2.7829357124865055, + "learning_rate": 9.75233212892573e-07, + "loss": 0.2783, + "num_tokens": 4745456.0, + "reward": 0.7529296875, + "reward_std": 0.016158465296030045, + "rewards//mean": 0.7529296875, + "rewards//std": 0.035781074315309525, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.11, + "grad_norm": 7.015782833099365, + "kl": 2.0335309226065874, + "learning_rate": 9.75134481573579e-07, + "loss": 0.2034, + "num_tokens": 4754000.0, + "reward": 0.77471923828125, + "reward_std": 0.013891384936869144, + "rewards//mean": 0.77471923828125, + "rewards//std": 0.028646297752857208, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1102, + "grad_norm": 4.9335432052612305, + "kl": 2.393045909702778, + "learning_rate": 9.750355588704727e-07, + "loss": 0.2393, + "num_tokens": 4762808.0, + "reward": 0.742919921875, + "reward_std": 0.020512012764811516, + "rewards//mean": 0.742919921875, + "rewards//std": 0.041292235255241394, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1104, + "grad_norm": 11.090091705322266, + "kl": 2.3459647679701447, + "learning_rate": 9.749364448231e-07, + "loss": 0.2346, + "num_tokens": 4771488.0, + "reward": 0.73565673828125, + "reward_std": 0.017126431688666344, + "rewards//mean": 0.73565673828125, + "rewards//std": 0.03207654878497124, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1106, + "grad_norm": 13.620132446289062, + "kl": 2.8425038745626807, + "learning_rate": 9.748371394713841e-07, + "loss": 0.2843, + "num_tokens": 4780144.0, + "reward": 0.72735595703125, + "reward_std": 0.016672534868121147, + "rewards//mean": 0.72735595703125, + "rewards//std": 0.039587125182151794, + "step": 553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1108, + "grad_norm": 6.813229084014893, + "kl": 2.043930523097515, + "learning_rate": 9.747376428553253e-07, + "loss": 0.2044, + "num_tokens": 4788744.0, + "reward": 0.703857421875, + "reward_std": 0.015327699482440948, + "rewards//mean": 0.703857421875, + "rewards//std": 0.046113792806863785, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.111, + "grad_norm": 6.221364974975586, + "kl": 2.4740346297621727, + "learning_rate": 9.746379550150008e-07, + "loss": 0.2474, + "num_tokens": 4797400.0, + "reward": 0.73052978515625, + "reward_std": 0.022129859775304794, + "rewards//mean": 0.73052978515625, + "rewards//std": 0.037217672914266586, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1112, + "grad_norm": 7.535991668701172, + "kl": 2.2564044035971165, + "learning_rate": 9.745380759905647e-07, + "loss": 0.2256, + "num_tokens": 4806144.0, + "reward": 0.7777099609375, + "reward_std": 0.015177038498222828, + "rewards//mean": 0.7777099609375, + "rewards//std": 0.036001864820718765, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1114, + "grad_norm": 4.758485317230225, + "kl": 2.0595958326011896, + "learning_rate": 9.744380058222482e-07, + "loss": 0.206, + "num_tokens": 4814792.0, + "reward": 0.7476806640625, + "reward_std": 0.023326821625232697, + "rewards//mean": 0.7476806640625, + "rewards//std": 0.049005601555109024, + "step": 557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1116, + "grad_norm": 4.56634521484375, + "kl": 1.2647319380193949, + "learning_rate": 9.743377445503597e-07, + "loss": 0.1265, + "num_tokens": 4823488.0, + "reward": 0.7706298828125, + "reward_std": 0.014785964973270893, + "rewards//mean": 0.7706298828125, + "rewards//std": 0.03946634382009506, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1118, + "grad_norm": 6.108590126037598, + "kl": 1.7404382824897766, + "learning_rate": 9.742372922152845e-07, + "loss": 0.174, + "num_tokens": 4832160.0, + "reward": 0.76446533203125, + "reward_std": 0.020382845774292946, + "rewards//mean": 0.76446533203125, + "rewards//std": 0.045194968581199646, + "step": 559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.112, + "grad_norm": 4.485249996185303, + "kl": 1.4466899689286947, + "learning_rate": 9.74136648857485e-07, + "loss": 0.1447, + "num_tokens": 4840776.0, + "reward": 0.713134765625, + "reward_std": 0.014563288539648056, + "rewards//mean": 0.713134765625, + "rewards//std": 0.04398473724722862, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1122, + "grad_norm": 5.07109260559082, + "kl": 1.5551688242703676, + "learning_rate": 9.740358145174997e-07, + "loss": 0.1555, + "num_tokens": 4849480.0, + "reward": 0.760009765625, + "reward_std": 0.021132897585630417, + "rewards//mean": 0.760009765625, + "rewards//std": 0.04068359360098839, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1124, + "grad_norm": 4.8608880043029785, + "kl": 1.7707404978573322, + "learning_rate": 9.73934789235945e-07, + "loss": 0.1771, + "num_tokens": 4858184.0, + "reward": 0.74798583984375, + "reward_std": 0.026072677224874496, + "rewards//mean": 0.74798583984375, + "rewards//std": 0.049194399267435074, + "step": 562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1126, + "grad_norm": 3.8644888401031494, + "kl": 1.1087552718818188, + "learning_rate": 9.73833573053514e-07, + "loss": 0.1109, + "num_tokens": 4866856.0, + "reward": 0.763427734375, + "reward_std": 0.016999846324324608, + "rewards//mean": 0.763427734375, + "rewards//std": 0.03900587931275368, + "step": 563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1128, + "grad_norm": 5.894697666168213, + "kl": 1.2242057928815484, + "learning_rate": 9.737321660109766e-07, + "loss": 0.1224, + "num_tokens": 4875376.0, + "reward": 0.727783203125, + "reward_std": 0.021369129419326782, + "rewards//mean": 0.727783203125, + "rewards//std": 0.04520672932267189, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.113, + "grad_norm": 5.239418983459473, + "kl": 1.495858235284686, + "learning_rate": 9.73630568149179e-07, + "loss": 0.1496, + "num_tokens": 4884008.0, + "reward": 0.71240234375, + "reward_std": 0.02082645893096924, + "rewards//mean": 0.71240234375, + "rewards//std": 0.04694470390677452, + "step": 565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1132, + "grad_norm": 4.3944268226623535, + "kl": 1.3522299639880657, + "learning_rate": 9.735287795090454e-07, + "loss": 0.1352, + "num_tokens": 4892672.0, + "reward": 0.75726318359375, + "reward_std": 0.018662169575691223, + "rewards//mean": 0.75726318359375, + "rewards//std": 0.04031875729560852, + "step": 566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1134, + "grad_norm": 4.5940046310424805, + "kl": 1.7546975184231997, + "learning_rate": 9.734268001315759e-07, + "loss": 0.1755, + "num_tokens": 4901272.0, + "reward": 0.71612548828125, + "reward_std": 0.021254749968647957, + "rewards//mean": 0.71612548828125, + "rewards//std": 0.05108821764588356, + "step": 567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1136, + "grad_norm": 7.10288667678833, + "kl": 0.663728054612875, + "learning_rate": 9.733246300578482e-07, + "loss": 0.0664, + "num_tokens": 4909840.0, + "reward": 0.75848388671875, + "reward_std": 0.010412106290459633, + "rewards//mean": 0.75848388671875, + "rewards//std": 0.02831890992820263, + "step": 568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1138, + "grad_norm": 7.828158855438232, + "kl": 1.429208105430007, + "learning_rate": 9.73222269329016e-07, + "loss": 0.1429, + "num_tokens": 4918384.0, + "reward": 0.73187255859375, + "reward_std": 0.019287483766674995, + "rewards//mean": 0.73187255859375, + "rewards//std": 0.026887016370892525, + "step": 569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.114, + "grad_norm": 4.482600212097168, + "kl": 1.6363477557897568, + "learning_rate": 9.731197179863103e-07, + "loss": 0.1636, + "num_tokens": 4927096.0, + "reward": 0.7735595703125, + "reward_std": 0.015531946904957294, + "rewards//mean": 0.7735595703125, + "rewards//std": 0.043442267924547195, + "step": 570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1142, + "grad_norm": 7.246872425079346, + "kl": 1.1119504496455193, + "learning_rate": 9.730169760710385e-07, + "loss": 0.1112, + "num_tokens": 4935776.0, + "reward": 0.73040771484375, + "reward_std": 0.014359238557517529, + "rewards//mean": 0.73040771484375, + "rewards//std": 0.035939738154411316, + "step": 571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1144, + "grad_norm": 6.416889667510986, + "kl": 1.7196300886571407, + "learning_rate": 9.729140436245856e-07, + "loss": 0.172, + "num_tokens": 4944400.0, + "reward": 0.72698974609375, + "reward_std": 0.021922361105680466, + "rewards//mean": 0.72698974609375, + "rewards//std": 0.04413449019193649, + "step": 572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1146, + "grad_norm": 4.504310131072998, + "kl": 1.8936888501048088, + "learning_rate": 9.728109206884125e-07, + "loss": 0.1894, + "num_tokens": 4953000.0, + "reward": 0.7447509765625, + "reward_std": 0.024887755513191223, + "rewards//mean": 0.7447509765625, + "rewards//std": 0.04389292374253273, + "step": 573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1148, + "grad_norm": 5.786618709564209, + "kl": 1.2161879613995552, + "learning_rate": 9.72707607304057e-07, + "loss": 0.1216, + "num_tokens": 4961696.0, + "reward": 0.75347900390625, + "reward_std": 0.01602158695459366, + "rewards//mean": 0.75347900390625, + "rewards//std": 0.03863930702209473, + "step": 574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.115, + "grad_norm": 5.082529544830322, + "kl": 1.2490643374621868, + "learning_rate": 9.726041035131338e-07, + "loss": 0.1249, + "num_tokens": 4970296.0, + "reward": 0.7366943359375, + "reward_std": 0.013393068686127663, + "rewards//mean": 0.7366943359375, + "rewards//std": 0.03220964968204498, + "step": 575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1152, + "grad_norm": 5.343238830566406, + "kl": 1.8169043827801943, + "learning_rate": 9.72500409357334e-07, + "loss": 0.1817, + "num_tokens": 4979040.0, + "reward": 0.742431640625, + "reward_std": 0.01911771297454834, + "rewards//mean": 0.742431640625, + "rewards//std": 0.05153579264879227, + "step": 576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1154, + "grad_norm": 7.661114692687988, + "kl": 1.20258454605937, + "learning_rate": 9.723965248784262e-07, + "loss": 0.1203, + "num_tokens": 4987720.0, + "reward": 0.782958984375, + "reward_std": 0.018420346081256866, + "rewards//mean": 0.782958984375, + "rewards//std": 0.028466660529375076, + "step": 577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1156, + "grad_norm": 5.413031101226807, + "kl": 1.6013918295502663, + "learning_rate": 9.722924501182546e-07, + "loss": 0.1601, + "num_tokens": 4996352.0, + "reward": 0.71514892578125, + "reward_std": 0.013243299908936024, + "rewards//mean": 0.71514892578125, + "rewards//std": 0.039661239832639694, + "step": 578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1158, + "grad_norm": 5.931016445159912, + "kl": 1.7467154283076525, + "learning_rate": 9.721881851187405e-07, + "loss": 0.1747, + "num_tokens": 5005168.0, + "reward": 0.7479248046875, + "reward_std": 0.017642071470618248, + "rewards//mean": 0.7479248046875, + "rewards//std": 0.048654671758413315, + "step": 579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.116, + "grad_norm": 5.606500148773193, + "kl": 1.4434173591434956, + "learning_rate": 9.720837299218818e-07, + "loss": 0.1443, + "num_tokens": 5013744.0, + "reward": 0.75146484375, + "reward_std": 0.013743579387664795, + "rewards//mean": 0.75146484375, + "rewards//std": 0.032626405358314514, + "step": 580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1162, + "grad_norm": 5.018765926361084, + "kl": 1.4865971878170967, + "learning_rate": 9.719790845697532e-07, + "loss": 0.1487, + "num_tokens": 5022304.0, + "reward": 0.74420166015625, + "reward_std": 0.01578206568956375, + "rewards//mean": 0.74420166015625, + "rewards//std": 0.04040352255105972, + "step": 581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1164, + "grad_norm": 6.59496545791626, + "kl": 1.6979880444705486, + "learning_rate": 9.71874249104506e-07, + "loss": 0.1698, + "num_tokens": 5030944.0, + "reward": 0.69598388671875, + "reward_std": 0.0127449631690979, + "rewards//mean": 0.69598388671875, + "rewards//std": 0.03594689816236496, + "step": 582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1166, + "grad_norm": 4.271090030670166, + "kl": 1.6714221462607384, + "learning_rate": 9.717692235683674e-07, + "loss": 0.1671, + "num_tokens": 5039632.0, + "reward": 0.75897216796875, + "reward_std": 0.012486828491091728, + "rewards//mean": 0.75897216796875, + "rewards//std": 0.03377307951450348, + "step": 583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1168, + "grad_norm": 6.605570316314697, + "kl": 2.5692805107682943, + "learning_rate": 9.716640080036423e-07, + "loss": 0.2569, + "num_tokens": 5048256.0, + "reward": 0.734130859375, + "reward_std": 0.01749209687113762, + "rewards//mean": 0.734130859375, + "rewards//std": 0.0323907844722271, + "step": 584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.117, + "grad_norm": 4.8662238121032715, + "kl": 1.6305445469915867, + "learning_rate": 9.715586024527109e-07, + "loss": 0.1631, + "num_tokens": 5056808.0, + "reward": 0.75189208984375, + "reward_std": 0.012177273631095886, + "rewards//mean": 0.75189208984375, + "rewards//std": 0.03622502088546753, + "step": 585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1172, + "grad_norm": 5.55615234375, + "kl": 2.160937760025263, + "learning_rate": 9.714530069580308e-07, + "loss": 0.2161, + "num_tokens": 5065400.0, + "reward": 0.7667236328125, + "reward_std": 0.01874316856265068, + "rewards//mean": 0.7667236328125, + "rewards//std": 0.03359900414943695, + "step": 586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1174, + "grad_norm": 4.025786876678467, + "kl": 2.293276358395815, + "learning_rate": 9.71347221562136e-07, + "loss": 0.2293, + "num_tokens": 5074064.0, + "reward": 0.77850341796875, + "reward_std": 0.015237913466989994, + "rewards//mean": 0.77850341796875, + "rewards//std": 0.03395189717411995, + "step": 587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1176, + "grad_norm": 5.526472568511963, + "kl": 1.1222283877432346, + "learning_rate": 9.712412463076367e-07, + "loss": 0.1122, + "num_tokens": 5082720.0, + "reward": 0.7344970703125, + "reward_std": 0.014784103259444237, + "rewards//mean": 0.7344970703125, + "rewards//std": 0.03695956990122795, + "step": 588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1178, + "grad_norm": 4.615313529968262, + "kl": 2.069821909070015, + "learning_rate": 9.711350812372196e-07, + "loss": 0.207, + "num_tokens": 5091344.0, + "reward": 0.73822021484375, + "reward_std": 0.017122317105531693, + "rewards//mean": 0.73822021484375, + "rewards//std": 0.03217785060405731, + "step": 589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.118, + "grad_norm": 3.021036148071289, + "kl": 1.9491257146000862, + "learning_rate": 9.710287263936483e-07, + "loss": 0.1949, + "num_tokens": 5100024.0, + "reward": 0.7535400390625, + "reward_std": 0.014991648495197296, + "rewards//mean": 0.7535400390625, + "rewards//std": 0.0286011453717947, + "step": 590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1182, + "grad_norm": 5.799602031707764, + "kl": 2.209879280999303, + "learning_rate": 9.709221818197623e-07, + "loss": 0.221, + "num_tokens": 5108640.0, + "reward": 0.7288818359375, + "reward_std": 0.018161766231060028, + "rewards//mean": 0.7288818359375, + "rewards//std": 0.03411585092544556, + "step": 591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1184, + "grad_norm": 9.670961380004883, + "kl": 2.449349695816636, + "learning_rate": 9.708154475584777e-07, + "loss": 0.2449, + "num_tokens": 5117224.0, + "reward": 0.7255859375, + "reward_std": 0.021229520440101624, + "rewards//mean": 0.7255859375, + "rewards//std": 0.05025145411491394, + "step": 592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1186, + "grad_norm": 7.468798637390137, + "kl": 2.5942456889897585, + "learning_rate": 9.707085236527873e-07, + "loss": 0.2594, + "num_tokens": 5125776.0, + "reward": 0.772216796875, + "reward_std": 0.01765240915119648, + "rewards//mean": 0.772216796875, + "rewards//std": 0.034187883138656616, + "step": 593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1188, + "grad_norm": 5.847729206085205, + "kl": 1.6708920057862997, + "learning_rate": 9.706014101457599e-07, + "loss": 0.1671, + "num_tokens": 5134408.0, + "reward": 0.77679443359375, + "reward_std": 0.020488444715738297, + "rewards//mean": 0.77679443359375, + "rewards//std": 0.02839844487607479, + "step": 594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.119, + "grad_norm": 4.506982326507568, + "kl": 1.6495173051953316, + "learning_rate": 9.704941070805405e-07, + "loss": 0.165, + "num_tokens": 5143040.0, + "reward": 0.75848388671875, + "reward_std": 0.015759726986289024, + "rewards//mean": 0.75848388671875, + "rewards//std": 0.03396526724100113, + "step": 595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1192, + "grad_norm": 5.312528610229492, + "kl": 1.6204978078603745, + "learning_rate": 9.70386614500351e-07, + "loss": 0.162, + "num_tokens": 5151704.0, + "reward": 0.74822998046875, + "reward_std": 0.01845605857670307, + "rewards//mean": 0.74822998046875, + "rewards//std": 0.03259988874197006, + "step": 596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1194, + "grad_norm": 5.818426609039307, + "kl": 1.2239194139838219, + "learning_rate": 9.702789324484896e-07, + "loss": 0.1224, + "num_tokens": 5160400.0, + "reward": 0.7462158203125, + "reward_std": 0.014775009825825691, + "rewards//mean": 0.7462158203125, + "rewards//std": 0.03261503577232361, + "step": 597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1196, + "grad_norm": 3.984865427017212, + "kl": 1.7139388527721167, + "learning_rate": 9.701710609683305e-07, + "loss": 0.1714, + "num_tokens": 5169024.0, + "reward": 0.75494384765625, + "reward_std": 0.014658878557384014, + "rewards//mean": 0.75494384765625, + "rewards//std": 0.036619026213884354, + "step": 598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1198, + "grad_norm": 4.632446765899658, + "kl": 2.2801280226558447, + "learning_rate": 9.700630001033243e-07, + "loss": 0.228, + "num_tokens": 5177672.0, + "reward": 0.74786376953125, + "reward_std": 0.016614826396107674, + "rewards//mean": 0.74786376953125, + "rewards//std": 0.035276126116514206, + "step": 599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.12, + "grad_norm": 4.269896984100342, + "kl": 1.2402616143226624, + "learning_rate": 9.699547498969978e-07, + "loss": 0.124, + "num_tokens": 5186400.0, + "reward": 0.7333984375, + "reward_std": 0.01491763349622488, + "rewards//mean": 0.7333984375, + "rewards//std": 0.04049937054514885, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1202, + "grad_norm": 5.355497360229492, + "kl": 1.1691132951527834, + "learning_rate": 9.698463103929541e-07, + "loss": 0.1169, + "num_tokens": 5195048.0, + "reward": 0.7392578125, + "reward_std": 0.011865407228469849, + "rewards//mean": 0.7392578125, + "rewards//std": 0.0266120582818985, + "step": 601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1204, + "grad_norm": 4.567091941833496, + "kl": 1.8471162002533674, + "learning_rate": 9.69737681634873e-07, + "loss": 0.1847, + "num_tokens": 5203712.0, + "reward": 0.76873779296875, + "reward_std": 0.013573193922638893, + "rewards//mean": 0.76873779296875, + "rewards//std": 0.02713640034198761, + "step": 602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1206, + "grad_norm": 4.146628379821777, + "kl": 2.1998727172613144, + "learning_rate": 9.696288636665097e-07, + "loss": 0.22, + "num_tokens": 5212352.0, + "reward": 0.75372314453125, + "reward_std": 0.019229482859373093, + "rewards//mean": 0.75372314453125, + "rewards//std": 0.03928773105144501, + "step": 603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1208, + "grad_norm": 10.259482383728027, + "kl": 1.3427194859832525, + "learning_rate": 9.695198565316964e-07, + "loss": 0.1343, + "num_tokens": 5220912.0, + "reward": 0.7747802734375, + "reward_std": 0.01869441568851471, + "rewards//mean": 0.7747802734375, + "rewards//std": 0.04368827864527702, + "step": 604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.121, + "grad_norm": 4.669132232666016, + "kl": 1.7059618532657623, + "learning_rate": 9.69410660274341e-07, + "loss": 0.1706, + "num_tokens": 5229616.0, + "reward": 0.75067138671875, + "reward_std": 0.021110327914357185, + "rewards//mean": 0.75067138671875, + "rewards//std": 0.037406329065561295, + "step": 605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1212, + "grad_norm": 3.9863343238830566, + "kl": 1.500737689435482, + "learning_rate": 9.693012749384277e-07, + "loss": 0.1501, + "num_tokens": 5238192.0, + "reward": 0.72528076171875, + "reward_std": 0.011379316449165344, + "rewards//mean": 0.72528076171875, + "rewards//std": 0.0314127616584301, + "step": 606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1214, + "grad_norm": 5.863799571990967, + "kl": 1.294170867651701, + "learning_rate": 9.691917005680173e-07, + "loss": 0.1294, + "num_tokens": 5246720.0, + "reward": 0.7669677734375, + "reward_std": 0.013313330709934235, + "rewards//mean": 0.7669677734375, + "rewards//std": 0.034154877066612244, + "step": 607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1216, + "grad_norm": 4.813351154327393, + "kl": 1.274811888113618, + "learning_rate": 9.690819372072456e-07, + "loss": 0.1275, + "num_tokens": 5255328.0, + "reward": 0.7459716796875, + "reward_std": 0.014467386528849602, + "rewards//mean": 0.7459716796875, + "rewards//std": 0.03138124197721481, + "step": 608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1218, + "grad_norm": 4.7343244552612305, + "kl": 1.8196150474250317, + "learning_rate": 9.68971984900326e-07, + "loss": 0.182, + "num_tokens": 5263992.0, + "reward": 0.73419189453125, + "reward_std": 0.014000032097101212, + "rewards//mean": 0.73419189453125, + "rewards//std": 0.041488584131002426, + "step": 609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.122, + "grad_norm": 5.491872310638428, + "kl": 2.081124259158969, + "learning_rate": 9.688618436915468e-07, + "loss": 0.2081, + "num_tokens": 5272632.0, + "reward": 0.7449951171875, + "reward_std": 0.013375763781368732, + "rewards//mean": 0.7449951171875, + "rewards//std": 0.033642228692770004, + "step": 610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1222, + "grad_norm": 4.709848880767822, + "kl": 1.860938437283039, + "learning_rate": 9.68751513625273e-07, + "loss": 0.1861, + "num_tokens": 5281264.0, + "reward": 0.758056640625, + "reward_std": 0.012330969795584679, + "rewards//mean": 0.758056640625, + "rewards//std": 0.029950950294733047, + "step": 611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1224, + "grad_norm": 5.294053077697754, + "kl": 1.5603632759302855, + "learning_rate": 9.686409947459457e-07, + "loss": 0.156, + "num_tokens": 5290096.0, + "reward": 0.72772216796875, + "reward_std": 0.019234199076890945, + "rewards//mean": 0.72772216796875, + "rewards//std": 0.039900969713926315, + "step": 612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1226, + "grad_norm": 4.610052108764648, + "kl": 1.8261591251939535, + "learning_rate": 9.685302870980817e-07, + "loss": 0.1826, + "num_tokens": 5298720.0, + "reward": 0.7579345703125, + "reward_std": 0.020076729357242584, + "rewards//mean": 0.7579345703125, + "rewards//std": 0.03839943930506706, + "step": 613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1228, + "grad_norm": 5.603288173675537, + "kl": 2.471636150032282, + "learning_rate": 9.684193907262742e-07, + "loss": 0.2472, + "num_tokens": 5307344.0, + "reward": 0.726318359375, + "reward_std": 0.020474456250667572, + "rewards//mean": 0.726318359375, + "rewards//std": 0.042128413915634155, + "step": 614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.123, + "grad_norm": 8.127079010009766, + "kl": 1.6221202835440636, + "learning_rate": 9.68308305675192e-07, + "loss": 0.1622, + "num_tokens": 5315880.0, + "reward": 0.764404296875, + "reward_std": 0.018329406157135963, + "rewards//mean": 0.764404296875, + "rewards//std": 0.037109375, + "step": 615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1232, + "grad_norm": 4.38250732421875, + "kl": 1.0610403437167406, + "learning_rate": 9.681970319895802e-07, + "loss": 0.1061, + "num_tokens": 5324592.0, + "reward": 0.763427734375, + "reward_std": 0.014990486204624176, + "rewards//mean": 0.763427734375, + "rewards//std": 0.03242067992687225, + "step": 616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1234, + "grad_norm": 7.516284465789795, + "kl": 1.3677379209548235, + "learning_rate": 9.6808556971426e-07, + "loss": 0.1368, + "num_tokens": 5333288.0, + "reward": 0.75140380859375, + "reward_std": 0.012255040928721428, + "rewards//mean": 0.75140380859375, + "rewards//std": 0.026633594185113907, + "step": 617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1236, + "grad_norm": 5.273319721221924, + "kl": 1.7700269967317581, + "learning_rate": 9.679739188941283e-07, + "loss": 0.177, + "num_tokens": 5341944.0, + "reward": 0.73919677734375, + "reward_std": 0.011965281330049038, + "rewards//mean": 0.73919677734375, + "rewards//std": 0.030803028494119644, + "step": 618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1238, + "grad_norm": 5.451859474182129, + "kl": 1.7796277161687613, + "learning_rate": 9.678620795741582e-07, + "loss": 0.178, + "num_tokens": 5350712.0, + "reward": 0.7913818359375, + "reward_std": 0.018562760204076767, + "rewards//mean": 0.7913818359375, + "rewards//std": 0.03736203908920288, + "step": 619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.124, + "grad_norm": 4.603490829467773, + "kl": 1.4915839675813913, + "learning_rate": 9.677500517993982e-07, + "loss": 0.1492, + "num_tokens": 5359400.0, + "reward": 0.7509765625, + "reward_std": 0.015333266928792, + "rewards//mean": 0.7509765625, + "rewards//std": 0.03739625960588455, + "step": 620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1242, + "grad_norm": 4.567759990692139, + "kl": 2.169027430936694, + "learning_rate": 9.676378356149732e-07, + "loss": 0.2169, + "num_tokens": 5368040.0, + "reward": 0.739990234375, + "reward_std": 0.010931908152997494, + "rewards//mean": 0.739990234375, + "rewards//std": 0.04056435450911522, + "step": 621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1244, + "grad_norm": 3.2500686645507812, + "kl": 1.6362348943948746, + "learning_rate": 9.675254310660841e-07, + "loss": 0.1636, + "num_tokens": 5376656.0, + "reward": 0.73065185546875, + "reward_std": 0.01525677926838398, + "rewards//mean": 0.73065185546875, + "rewards//std": 0.04145354405045509, + "step": 622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1246, + "grad_norm": 3.7679014205932617, + "kl": 1.2271438892930746, + "learning_rate": 9.674128381980071e-07, + "loss": 0.1227, + "num_tokens": 5385312.0, + "reward": 0.74261474609375, + "reward_std": 0.00787612609565258, + "rewards//mean": 0.74261474609375, + "rewards//std": 0.027613617479801178, + "step": 623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1248, + "grad_norm": 2.993044853210449, + "kl": 1.6120636332780123, + "learning_rate": 9.67300057056095e-07, + "loss": 0.1612, + "num_tokens": 5393888.0, + "reward": 0.75115966796875, + "reward_std": 0.011264875531196594, + "rewards//mean": 0.75115966796875, + "rewards//std": 0.03326001018285751, + "step": 624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.125, + "grad_norm": 4.802676200866699, + "kl": 2.1080329287797213, + "learning_rate": 9.671870876857758e-07, + "loss": 0.2108, + "num_tokens": 5402496.0, + "reward": 0.752197265625, + "reward_std": 0.016877135261893272, + "rewards//mean": 0.752197265625, + "rewards//std": 0.04259154945611954, + "step": 625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1252, + "grad_norm": 4.552894592285156, + "kl": 1.7008184995502234, + "learning_rate": 9.670739301325534e-07, + "loss": 0.1701, + "num_tokens": 5411160.0, + "reward": 0.75225830078125, + "reward_std": 0.014657140709459782, + "rewards//mean": 0.75225830078125, + "rewards//std": 0.04453704133629799, + "step": 626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1254, + "grad_norm": 9.11898422241211, + "kl": 1.379117302596569, + "learning_rate": 9.669605844420078e-07, + "loss": 0.1379, + "num_tokens": 5419800.0, + "reward": 0.785400390625, + "reward_std": 0.017020780593156815, + "rewards//mean": 0.785400390625, + "rewards//std": 0.028863780200481415, + "step": 627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1256, + "grad_norm": 5.11296272277832, + "kl": 2.194600412622094, + "learning_rate": 9.668470506597946e-07, + "loss": 0.2195, + "num_tokens": 5428536.0, + "reward": 0.73187255859375, + "reward_std": 0.018063906580209732, + "rewards//mean": 0.73187255859375, + "rewards//std": 0.03468368574976921, + "step": 628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1258, + "grad_norm": 4.903803825378418, + "kl": 2.3288608007133007, + "learning_rate": 9.667333288316453e-07, + "loss": 0.2329, + "num_tokens": 5437144.0, + "reward": 0.78515625, + "reward_std": 0.016290197148919106, + "rewards//mean": 0.78515625, + "rewards//std": 0.03326234221458435, + "step": 629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.126, + "grad_norm": 16.58167839050293, + "kl": 3.676839765161276, + "learning_rate": 9.66619419003367e-07, + "loss": 0.3677, + "num_tokens": 5445856.0, + "reward": 0.74432373046875, + "reward_std": 0.01929108425974846, + "rewards//mean": 0.74432373046875, + "rewards//std": 0.051831282675266266, + "step": 630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1262, + "grad_norm": 5.5465989112854, + "kl": 2.333956880494952, + "learning_rate": 9.665053212208426e-07, + "loss": 0.2334, + "num_tokens": 5454512.0, + "reward": 0.72186279296875, + "reward_std": 0.017179621383547783, + "rewards//mean": 0.72186279296875, + "rewards//std": 0.042921826243400574, + "step": 631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1264, + "grad_norm": 5.082263469696045, + "kl": 1.5779585037380457, + "learning_rate": 9.663910355300304e-07, + "loss": 0.1578, + "num_tokens": 5463144.0, + "reward": 0.75909423828125, + "reward_std": 0.018127642571926117, + "rewards//mean": 0.75909423828125, + "rewards//std": 0.03502202779054642, + "step": 632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1266, + "grad_norm": 5.518665313720703, + "kl": 1.1804874017834663, + "learning_rate": 9.66276561976965e-07, + "loss": 0.118, + "num_tokens": 5471768.0, + "reward": 0.74847412109375, + "reward_std": 0.01492733508348465, + "rewards//mean": 0.74847412109375, + "rewards//std": 0.028540944680571556, + "step": 633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1268, + "grad_norm": 3.527820110321045, + "kl": 2.0510126557201147, + "learning_rate": 9.661619006077561e-07, + "loss": 0.2051, + "num_tokens": 5480384.0, + "reward": 0.73809814453125, + "reward_std": 0.014803184196352959, + "rewards//mean": 0.73809814453125, + "rewards//std": 0.026779266074299812, + "step": 634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.127, + "grad_norm": 4.203972339630127, + "kl": 2.5589918410405517, + "learning_rate": 9.660470514685895e-07, + "loss": 0.2559, + "num_tokens": 5488984.0, + "reward": 0.72760009765625, + "reward_std": 0.018420351669192314, + "rewards//mean": 0.72760009765625, + "rewards//std": 0.03252783417701721, + "step": 635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1272, + "grad_norm": 7.727021217346191, + "kl": 2.554689183831215, + "learning_rate": 9.659320146057262e-07, + "loss": 0.2555, + "num_tokens": 5497624.0, + "reward": 0.80303955078125, + "reward_std": 0.020463142544031143, + "rewards//mean": 0.80303955078125, + "rewards//std": 0.04383542016148567, + "step": 636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1274, + "grad_norm": 7.647414207458496, + "kl": 2.474998451769352, + "learning_rate": 9.65816790065503e-07, + "loss": 0.2475, + "num_tokens": 5506312.0, + "reward": 0.730712890625, + "reward_std": 0.02033475786447525, + "rewards//mean": 0.730712890625, + "rewards//std": 0.03804398328065872, + "step": 637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1276, + "grad_norm": 4.044209957122803, + "kl": 1.9211649019271135, + "learning_rate": 9.657013778943327e-07, + "loss": 0.1921, + "num_tokens": 5514912.0, + "reward": 0.73089599609375, + "reward_std": 0.01036878488957882, + "rewards//mean": 0.73089599609375, + "rewards//std": 0.01902388222515583, + "step": 638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1278, + "grad_norm": 10.105807304382324, + "kl": 1.2389811612665653, + "learning_rate": 9.65585778138703e-07, + "loss": 0.1239, + "num_tokens": 5523536.0, + "reward": 0.7843017578125, + "reward_std": 0.011940107680857182, + "rewards//mean": 0.7843017578125, + "rewards//std": 0.026761524379253387, + "step": 639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.128, + "grad_norm": 4.215726375579834, + "kl": 2.140462227165699, + "learning_rate": 9.654699908451776e-07, + "loss": 0.214, + "num_tokens": 5532184.0, + "reward": 0.7275390625, + "reward_std": 0.019287630915641785, + "rewards//mean": 0.7275390625, + "rewards//std": 0.034492045640945435, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1282, + "grad_norm": 7.801215171813965, + "kl": 2.9723503328859806, + "learning_rate": 9.653540160603955e-07, + "loss": 0.2972, + "num_tokens": 5540808.0, + "reward": 0.75579833984375, + "reward_std": 0.021624911576509476, + "rewards//mean": 0.75579833984375, + "rewards//std": 0.04161427170038223, + "step": 641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1284, + "grad_norm": 10.198945045471191, + "kl": 2.7207969166338444, + "learning_rate": 9.652378538310713e-07, + "loss": 0.2721, + "num_tokens": 5549448.0, + "reward": 0.7222900390625, + "reward_std": 0.01807771623134613, + "rewards//mean": 0.7222900390625, + "rewards//std": 0.04192002862691879, + "step": 642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1286, + "grad_norm": 4.129467964172363, + "kl": 1.4408370926976204, + "learning_rate": 9.651215042039953e-07, + "loss": 0.1441, + "num_tokens": 5558080.0, + "reward": 0.78790283203125, + "reward_std": 0.01506801974028349, + "rewards//mean": 0.78790283203125, + "rewards//std": 0.029104484245181084, + "step": 643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1288, + "grad_norm": 5.908904552459717, + "kl": 1.435167744755745, + "learning_rate": 9.650049672260333e-07, + "loss": 0.1435, + "num_tokens": 5566712.0, + "reward": 0.7703857421875, + "reward_std": 0.012270634062588215, + "rewards//mean": 0.7703857421875, + "rewards//std": 0.03359900414943695, + "step": 644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.129, + "grad_norm": 9.64582633972168, + "kl": 2.590956222265959, + "learning_rate": 9.648882429441256e-07, + "loss": 0.2591, + "num_tokens": 5575304.0, + "reward": 0.76251220703125, + "reward_std": 0.019254688173532486, + "rewards//mean": 0.76251220703125, + "rewards//std": 0.03916887938976288, + "step": 645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1292, + "grad_norm": 10.265853881835938, + "kl": 2.0684807300567627, + "learning_rate": 9.647713314052895e-07, + "loss": 0.2068, + "num_tokens": 5584080.0, + "reward": 0.75274658203125, + "reward_std": 0.016576694324612617, + "rewards//mean": 0.75274658203125, + "rewards//std": 0.03929543495178223, + "step": 646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1294, + "grad_norm": 6.955067157745361, + "kl": 2.5854177810251713, + "learning_rate": 9.646542326566168e-07, + "loss": 0.2585, + "num_tokens": 5592720.0, + "reward": 0.7518310546875, + "reward_std": 0.018285535275936127, + "rewards//mean": 0.7518310546875, + "rewards//std": 0.034582991153001785, + "step": 647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1296, + "grad_norm": 3.627410411834717, + "kl": 1.9893319997936487, + "learning_rate": 9.645369467452745e-07, + "loss": 0.1989, + "num_tokens": 5601432.0, + "reward": 0.76531982421875, + "reward_std": 0.01493283361196518, + "rewards//mean": 0.76531982421875, + "rewards//std": 0.022242441773414612, + "step": 648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1298, + "grad_norm": 4.492605209350586, + "kl": 1.9629135336726904, + "learning_rate": 9.644194737185057e-07, + "loss": 0.1963, + "num_tokens": 5610040.0, + "reward": 0.7099609375, + "reward_std": 0.014292044565081596, + "rewards//mean": 0.7099609375, + "rewards//std": 0.02810811810195446, + "step": 649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.13, + "grad_norm": 10.583423614501953, + "kl": 1.5659422241151333, + "learning_rate": 9.643018136236286e-07, + "loss": 0.1566, + "num_tokens": 5618752.0, + "reward": 0.733642578125, + "reward_std": 0.013740007765591145, + "rewards//mean": 0.733642578125, + "rewards//std": 0.03085121139883995, + "step": 650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1302, + "grad_norm": 6.3281683921813965, + "kl": 1.6083077788352966, + "learning_rate": 9.641839665080363e-07, + "loss": 0.1608, + "num_tokens": 5627320.0, + "reward": 0.7452392578125, + "reward_std": 0.012915275990962982, + "rewards//mean": 0.7452392578125, + "rewards//std": 0.02484673634171486, + "step": 651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1304, + "grad_norm": 5.759901523590088, + "kl": 1.907441422343254, + "learning_rate": 9.640659324191978e-07, + "loss": 0.1907, + "num_tokens": 5635952.0, + "reward": 0.76611328125, + "reward_std": 0.01220618188381195, + "rewards//mean": 0.76611328125, + "rewards//std": 0.03522507846355438, + "step": 652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1306, + "grad_norm": 15.900641441345215, + "kl": 1.3867097087204456, + "learning_rate": 9.639477114046572e-07, + "loss": 0.1387, + "num_tokens": 5644616.0, + "reward": 0.74920654296875, + "reward_std": 0.009857980534434319, + "rewards//mean": 0.74920654296875, + "rewards//std": 0.03010665625333786, + "step": 653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1308, + "grad_norm": 4.807596683502197, + "kl": 1.720539940521121, + "learning_rate": 9.63829303512034e-07, + "loss": 0.1721, + "num_tokens": 5653272.0, + "reward": 0.7734375, + "reward_std": 0.018061885610222816, + "rewards//mean": 0.7734375, + "rewards//std": 0.034400638192892075, + "step": 654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.131, + "grad_norm": 8.267892837524414, + "kl": 1.309445545077324, + "learning_rate": 9.63710708789023e-07, + "loss": 0.1309, + "num_tokens": 5661888.0, + "reward": 0.78753662109375, + "reward_std": 0.01528235524892807, + "rewards//mean": 0.78753662109375, + "rewards//std": 0.02812204882502556, + "step": 655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1312, + "grad_norm": 9.04378604888916, + "kl": 2.7586640175431967, + "learning_rate": 9.635919272833937e-07, + "loss": 0.2759, + "num_tokens": 5670504.0, + "reward": 0.72735595703125, + "reward_std": 0.018271243199706078, + "rewards//mean": 0.72735595703125, + "rewards//std": 0.039427731186151505, + "step": 656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1314, + "grad_norm": 4.136282920837402, + "kl": 2.15822652541101, + "learning_rate": 9.634729590429916e-07, + "loss": 0.2158, + "num_tokens": 5679168.0, + "reward": 0.76129150390625, + "reward_std": 0.015018263831734657, + "rewards//mean": 0.76129150390625, + "rewards//std": 0.03659835085272789, + "step": 657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1316, + "grad_norm": 6.392247676849365, + "kl": 1.6154142674058676, + "learning_rate": 9.63353804115737e-07, + "loss": 0.1615, + "num_tokens": 5687896.0, + "reward": 0.77197265625, + "reward_std": 0.018114907667040825, + "rewards//mean": 0.77197265625, + "rewards//std": 0.033476460725069046, + "step": 658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1318, + "grad_norm": 6.043969631195068, + "kl": 2.5147317461669445, + "learning_rate": 9.632344625496255e-07, + "loss": 0.2515, + "num_tokens": 5696520.0, + "reward": 0.7630615234375, + "reward_std": 0.02536243014037609, + "rewards//mean": 0.7630615234375, + "rewards//std": 0.03221340849995613, + "step": 659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.132, + "grad_norm": 6.0581955909729, + "kl": 1.5347889047116041, + "learning_rate": 9.63114934392728e-07, + "loss": 0.1535, + "num_tokens": 5705160.0, + "reward": 0.71990966796875, + "reward_std": 0.010831142775714397, + "rewards//mean": 0.71990966796875, + "rewards//std": 0.026817677542567253, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1322, + "grad_norm": 6.137468338012695, + "kl": 1.5315412282943726, + "learning_rate": 9.6299521969319e-07, + "loss": 0.1532, + "num_tokens": 5713848.0, + "reward": 0.77978515625, + "reward_std": 0.008931154385209084, + "rewards//mean": 0.77978515625, + "rewards//std": 0.023359866812825203, + "step": 661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1324, + "grad_norm": 6.5758233070373535, + "kl": 2.190411478281021, + "learning_rate": 9.628753184992333e-07, + "loss": 0.219, + "num_tokens": 5722440.0, + "reward": 0.770263671875, + "reward_std": 0.02048276737332344, + "rewards//mean": 0.770263671875, + "rewards//std": 0.03782690688967705, + "step": 662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1326, + "grad_norm": 6.5012736320495605, + "kl": 1.871248772367835, + "learning_rate": 9.627552308591533e-07, + "loss": 0.1871, + "num_tokens": 5731056.0, + "reward": 0.77105712890625, + "reward_std": 0.00970059260725975, + "rewards//mean": 0.77105712890625, + "rewards//std": 0.026508256793022156, + "step": 663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1328, + "grad_norm": 8.617633819580078, + "kl": 2.442201526835561, + "learning_rate": 9.62634956821322e-07, + "loss": 0.2442, + "num_tokens": 5739704.0, + "reward": 0.7359619140625, + "reward_std": 0.015514541417360306, + "rewards//mean": 0.7359619140625, + "rewards//std": 0.04491564258933067, + "step": 664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.133, + "grad_norm": 5.606274604797363, + "kl": 1.9821086134761572, + "learning_rate": 9.625144964341852e-07, + "loss": 0.1982, + "num_tokens": 5748272.0, + "reward": 0.75164794921875, + "reward_std": 0.012744968757033348, + "rewards//mean": 0.75164794921875, + "rewards//std": 0.02847456932067871, + "step": 665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1332, + "grad_norm": 8.139525413513184, + "kl": 1.9455420151352882, + "learning_rate": 9.623938497462645e-07, + "loss": 0.1946, + "num_tokens": 5756944.0, + "reward": 0.77069091796875, + "reward_std": 0.011618856340646744, + "rewards//mean": 0.77069091796875, + "rewards//std": 0.02731875702738762, + "step": 666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1334, + "grad_norm": 10.729142189025879, + "kl": 2.0259072836488485, + "learning_rate": 9.622730168061567e-07, + "loss": 0.2026, + "num_tokens": 5765464.0, + "reward": 0.743896484375, + "reward_std": 0.016371024772524834, + "rewards//mean": 0.743896484375, + "rewards//std": 0.039092715829610825, + "step": 667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1336, + "grad_norm": 5.993306636810303, + "kl": 2.1834686268121004, + "learning_rate": 9.621519976625326e-07, + "loss": 0.2183, + "num_tokens": 5774152.0, + "reward": 0.75738525390625, + "reward_std": 0.02642243169248104, + "rewards//mean": 0.75738525390625, + "rewards//std": 0.04792817682027817, + "step": 668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1338, + "grad_norm": 9.258829116821289, + "kl": 2.034265171736479, + "learning_rate": 9.620307923641392e-07, + "loss": 0.2034, + "num_tokens": 5782856.0, + "reward": 0.75311279296875, + "reward_std": 0.011320114135742188, + "rewards//mean": 0.75311279296875, + "rewards//std": 0.03563985973596573, + "step": 669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.134, + "grad_norm": 10.145259857177734, + "kl": 3.0949180014431477, + "learning_rate": 9.61909400959798e-07, + "loss": 0.3095, + "num_tokens": 5791512.0, + "reward": 0.7398681640625, + "reward_std": 0.022302493453025818, + "rewards//mean": 0.7398681640625, + "rewards//std": 0.04281752556562424, + "step": 670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1342, + "grad_norm": 7.549869060516357, + "kl": 3.029162682592869, + "learning_rate": 9.617878234984054e-07, + "loss": 0.3029, + "num_tokens": 5800024.0, + "reward": 0.7257080078125, + "reward_std": 0.019062813371419907, + "rewards//mean": 0.7257080078125, + "rewards//std": 0.028463203459978104, + "step": 671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1344, + "grad_norm": 3.5927047729492188, + "kl": 2.5494485460221767, + "learning_rate": 9.616660600289327e-07, + "loss": 0.2549, + "num_tokens": 5808632.0, + "reward": 0.74658203125, + "reward_std": 0.01976936310529709, + "rewards//mean": 0.74658203125, + "rewards//std": 0.03476830944418907, + "step": 672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1346, + "grad_norm": 5.675290584564209, + "kl": 2.3224766980856657, + "learning_rate": 9.615441106004262e-07, + "loss": 0.2322, + "num_tokens": 5817208.0, + "reward": 0.7484130859375, + "reward_std": 0.026001352816820145, + "rewards//mean": 0.7484130859375, + "rewards//std": 0.04079562425613403, + "step": 673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1348, + "grad_norm": 9.404773712158203, + "kl": 2.733437206596136, + "learning_rate": 9.614219752620072e-07, + "loss": 0.2733, + "num_tokens": 5826000.0, + "reward": 0.7508544921875, + "reward_std": 0.016492925584316254, + "rewards//mean": 0.7508544921875, + "rewards//std": 0.03719799220561981, + "step": 674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.135, + "grad_norm": 15.974115371704102, + "kl": 3.1622733511030674, + "learning_rate": 9.612996540628717e-07, + "loss": 0.3162, + "num_tokens": 5834584.0, + "reward": 0.74810791015625, + "reward_std": 0.021986238658428192, + "rewards//mean": 0.74810791015625, + "rewards//std": 0.04708118736743927, + "step": 675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1352, + "grad_norm": 13.984763145446777, + "kl": 3.1775437872856855, + "learning_rate": 9.611771470522907e-07, + "loss": 0.3178, + "num_tokens": 5843224.0, + "reward": 0.7183837890625, + "reward_std": 0.02330601029098034, + "rewards//mean": 0.7183837890625, + "rewards//std": 0.03680689260363579, + "step": 676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1354, + "grad_norm": 12.465259552001953, + "kl": 2.977510152384639, + "learning_rate": 9.6105445427961e-07, + "loss": 0.2978, + "num_tokens": 5851800.0, + "reward": 0.6907958984375, + "reward_std": 0.024164468050003052, + "rewards//mean": 0.6907958984375, + "rewards//std": 0.039776578545570374, + "step": 677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1356, + "grad_norm": 7.052098751068115, + "kl": 2.7345681935548782, + "learning_rate": 9.609315757942502e-07, + "loss": 0.2735, + "num_tokens": 5860408.0, + "reward": 0.73309326171875, + "reward_std": 0.022397270426154137, + "rewards//mean": 0.73309326171875, + "rewards//std": 0.039469942450523376, + "step": 678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1358, + "grad_norm": 6.85525369644165, + "kl": 1.7807729430496693, + "learning_rate": 9.608085116457068e-07, + "loss": 0.1781, + "num_tokens": 5869048.0, + "reward": 0.75848388671875, + "reward_std": 0.010272054001688957, + "rewards//mean": 0.75848388671875, + "rewards//std": 0.035156626254320145, + "step": 679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.136, + "grad_norm": 6.290085792541504, + "kl": 2.1790910735726357, + "learning_rate": 9.606852618835502e-07, + "loss": 0.2179, + "num_tokens": 5877704.0, + "reward": 0.75604248046875, + "reward_std": 0.01704718917608261, + "rewards//mean": 0.75604248046875, + "rewards//std": 0.043335992842912674, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1362, + "grad_norm": 4.2021484375, + "kl": 2.2223317623138428, + "learning_rate": 9.60561826557425e-07, + "loss": 0.2222, + "num_tokens": 5886232.0, + "reward": 0.74078369140625, + "reward_std": 0.021687496453523636, + "rewards//mean": 0.74078369140625, + "rewards//std": 0.035477664321660995, + "step": 681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1364, + "grad_norm": 5.2405476570129395, + "kl": 2.203371226787567, + "learning_rate": 9.604382057170512e-07, + "loss": 0.2203, + "num_tokens": 5894872.0, + "reward": 0.72686767578125, + "reward_std": 0.01516915112733841, + "rewards//mean": 0.72686767578125, + "rewards//std": 0.03954504057765007, + "step": 682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1366, + "grad_norm": 7.295520305633545, + "kl": 1.8267599921673536, + "learning_rate": 9.603143994122232e-07, + "loss": 0.1827, + "num_tokens": 5903480.0, + "reward": 0.720703125, + "reward_std": 0.016644855961203575, + "rewards//mean": 0.720703125, + "rewards//std": 0.03944505378603935, + "step": 683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1368, + "grad_norm": 5.629857063293457, + "kl": 1.5262629892677069, + "learning_rate": 9.601904076928102e-07, + "loss": 0.1526, + "num_tokens": 5912088.0, + "reward": 0.748046875, + "reward_std": 0.01910483092069626, + "rewards//mean": 0.748046875, + "rewards//std": 0.03967159241437912, + "step": 684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.137, + "grad_norm": 9.598852157592773, + "kl": 1.8031053133308887, + "learning_rate": 9.60066230608756e-07, + "loss": 0.1803, + "num_tokens": 5920760.0, + "reward": 0.776123046875, + "reward_std": 0.011279763653874397, + "rewards//mean": 0.776123046875, + "rewards//std": 0.02671085111796856, + "step": 685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1372, + "grad_norm": 5.456700801849365, + "kl": 2.1518442891538143, + "learning_rate": 9.599418682100792e-07, + "loss": 0.2152, + "num_tokens": 5929320.0, + "reward": 0.7392578125, + "reward_std": 0.02009446546435356, + "rewards//mean": 0.7392578125, + "rewards//std": 0.03820360451936722, + "step": 686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1374, + "grad_norm": 6.295849323272705, + "kl": 1.056298403069377, + "learning_rate": 9.598173205468727e-07, + "loss": 0.1056, + "num_tokens": 5937912.0, + "reward": 0.73492431640625, + "reward_std": 0.01417911984026432, + "rewards//mean": 0.73492431640625, + "rewards//std": 0.03890012949705124, + "step": 687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1376, + "grad_norm": 7.775590419769287, + "kl": 1.8858890049159527, + "learning_rate": 9.596925876693047e-07, + "loss": 0.1886, + "num_tokens": 5946624.0, + "reward": 0.72491455078125, + "reward_std": 0.017337076365947723, + "rewards//mean": 0.72491455078125, + "rewards//std": 0.03598225489258766, + "step": 688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1378, + "grad_norm": 5.867929458618164, + "kl": 1.2759409677237272, + "learning_rate": 9.595676696276171e-07, + "loss": 0.1276, + "num_tokens": 5955192.0, + "reward": 0.713134765625, + "reward_std": 0.013202743604779243, + "rewards//mean": 0.713134765625, + "rewards//std": 0.04182259738445282, + "step": 689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.138, + "grad_norm": 6.387970924377441, + "kl": 1.318649284541607, + "learning_rate": 9.594425664721274e-07, + "loss": 0.1319, + "num_tokens": 5963760.0, + "reward": 0.76019287109375, + "reward_std": 0.01943659968674183, + "rewards//mean": 0.76019287109375, + "rewards//std": 0.033113591372966766, + "step": 690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1382, + "grad_norm": 5.6375651359558105, + "kl": 1.0852784998714924, + "learning_rate": 9.593172782532267e-07, + "loss": 0.1085, + "num_tokens": 5972352.0, + "reward": 0.74737548828125, + "reward_std": 0.013980841264128685, + "rewards//mean": 0.74737548828125, + "rewards//std": 0.03712359815835953, + "step": 691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1384, + "grad_norm": 10.946083068847656, + "kl": 1.2588221821933985, + "learning_rate": 9.591918050213813e-07, + "loss": 0.1259, + "num_tokens": 5980920.0, + "reward": 0.748291015625, + "reward_std": 0.01414998434484005, + "rewards//mean": 0.748291015625, + "rewards//std": 0.030432282015681267, + "step": 692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1386, + "grad_norm": 16.50353240966797, + "kl": 1.2514857109636068, + "learning_rate": 9.590661468271318e-07, + "loss": 0.1251, + "num_tokens": 5989576.0, + "reward": 0.75640869140625, + "reward_std": 0.02333802357316017, + "rewards//mean": 0.75640869140625, + "rewards//std": 0.045641250908374786, + "step": 693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1388, + "grad_norm": 9.173646926879883, + "kl": 1.1957140490412712, + "learning_rate": 9.589403037210931e-07, + "loss": 0.1196, + "num_tokens": 5998160.0, + "reward": 0.7657470703125, + "reward_std": 0.020367193967103958, + "rewards//mean": 0.7657470703125, + "rewards//std": 0.04024120047688484, + "step": 694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.139, + "grad_norm": 5.923418998718262, + "kl": 1.862734381109476, + "learning_rate": 9.58814275753955e-07, + "loss": 0.1863, + "num_tokens": 6006776.0, + "reward": 0.74822998046875, + "reward_std": 0.021520305424928665, + "rewards//mean": 0.74822998046875, + "rewards//std": 0.04513195529580116, + "step": 695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1392, + "grad_norm": 7.198389530181885, + "kl": 1.7558288034051657, + "learning_rate": 9.586880629764817e-07, + "loss": 0.1756, + "num_tokens": 6015376.0, + "reward": 0.767578125, + "reward_std": 0.019421234726905823, + "rewards//mean": 0.767578125, + "rewards//std": 0.03533834591507912, + "step": 696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1394, + "grad_norm": 4.785677433013916, + "kl": 2.1740710642188787, + "learning_rate": 9.585616654395112e-07, + "loss": 0.2174, + "num_tokens": 6023976.0, + "reward": 0.74737548828125, + "reward_std": 0.024873943999409676, + "rewards//mean": 0.74737548828125, + "rewards//std": 0.04465245455503464, + "step": 697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1396, + "grad_norm": 4.419807434082031, + "kl": 1.7555591221898794, + "learning_rate": 9.584350831939569e-07, + "loss": 0.1756, + "num_tokens": 6032584.0, + "reward": 0.77490234375, + "reward_std": 0.022312358021736145, + "rewards//mean": 0.77490234375, + "rewards//std": 0.03846580535173416, + "step": 698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1398, + "grad_norm": 5.374952793121338, + "kl": 2.5095388777554035, + "learning_rate": 9.58308316290806e-07, + "loss": 0.251, + "num_tokens": 6041200.0, + "reward": 0.7264404296875, + "reward_std": 0.016221458092331886, + "rewards//mean": 0.7264404296875, + "rewards//std": 0.04364251717925072, + "step": 699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.14, + "grad_norm": 20.355453491210938, + "kl": 2.0094070453196764, + "learning_rate": 9.581813647811197e-07, + "loss": 0.2009, + "num_tokens": 6049776.0, + "reward": 0.7491455078125, + "reward_std": 0.015299257822334766, + "rewards//mean": 0.7491455078125, + "rewards//std": 0.04100732132792473, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1402, + "grad_norm": 8.70608901977539, + "kl": 2.6194078754633665, + "learning_rate": 9.580542287160346e-07, + "loss": 0.2619, + "num_tokens": 6058352.0, + "reward": 0.75726318359375, + "reward_std": 0.01727396994829178, + "rewards//mean": 0.75726318359375, + "rewards//std": 0.04509403929114342, + "step": 701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1404, + "grad_norm": 5.559673309326172, + "kl": 2.009598921984434, + "learning_rate": 9.579269081467613e-07, + "loss": 0.201, + "num_tokens": 6066912.0, + "reward": 0.73590087890625, + "reward_std": 0.01806057244539261, + "rewards//mean": 0.73590087890625, + "rewards//std": 0.042766012251377106, + "step": 702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1406, + "grad_norm": 7.46162748336792, + "kl": 2.4945561960339546, + "learning_rate": 9.57799403124584e-07, + "loss": 0.2495, + "num_tokens": 6075536.0, + "reward": 0.74627685546875, + "reward_std": 0.025955861434340477, + "rewards//mean": 0.74627685546875, + "rewards//std": 0.04330524057149887, + "step": 703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1408, + "grad_norm": 5.260041236877441, + "kl": 2.1661485619843006, + "learning_rate": 9.576717137008617e-07, + "loss": 0.2166, + "num_tokens": 6084152.0, + "reward": 0.7259521484375, + "reward_std": 0.018186451867222786, + "rewards//mean": 0.7259521484375, + "rewards//std": 0.046793390065431595, + "step": 704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.141, + "grad_norm": 5.460837364196777, + "kl": 2.0772647876292467, + "learning_rate": 9.575438399270278e-07, + "loss": 0.2077, + "num_tokens": 6092728.0, + "reward": 0.77392578125, + "reward_std": 0.025246405974030495, + "rewards//mean": 0.77392578125, + "rewards//std": 0.03738654404878616, + "step": 705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1412, + "grad_norm": 5.491000175476074, + "kl": 1.6645305138081312, + "learning_rate": 9.5741578185459e-07, + "loss": 0.1665, + "num_tokens": 6101480.0, + "reward": 0.71630859375, + "reward_std": 0.01862640306353569, + "rewards//mean": 0.71630859375, + "rewards//std": 0.052331533282995224, + "step": 706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1414, + "grad_norm": 4.078625679016113, + "kl": 2.264868099242449, + "learning_rate": 9.572875395351301e-07, + "loss": 0.2265, + "num_tokens": 6110208.0, + "reward": 0.72979736328125, + "reward_std": 0.02179661951959133, + "rewards//mean": 0.72979736328125, + "rewards//std": 0.04679512605071068, + "step": 707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1416, + "grad_norm": 4.522886753082275, + "kl": 1.8940012231469154, + "learning_rate": 9.571591130203037e-07, + "loss": 0.1894, + "num_tokens": 6118832.0, + "reward": 0.736328125, + "reward_std": 0.016552813351154327, + "rewards//mean": 0.736328125, + "rewards//std": 0.040132906287908554, + "step": 708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1418, + "grad_norm": 14.472113609313965, + "kl": 1.6828726809471846, + "learning_rate": 9.570305023618415e-07, + "loss": 0.1683, + "num_tokens": 6127480.0, + "reward": 0.75146484375, + "reward_std": 0.020189717411994934, + "rewards//mean": 0.75146484375, + "rewards//std": 0.04437193274497986, + "step": 709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.142, + "grad_norm": 4.928863048553467, + "kl": 1.7356335557997227, + "learning_rate": 9.569017076115475e-07, + "loss": 0.1736, + "num_tokens": 6136016.0, + "reward": 0.7225341796875, + "reward_std": 0.024343695491552353, + "rewards//mean": 0.7225341796875, + "rewards//std": 0.044210441410541534, + "step": 710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1422, + "grad_norm": 8.598285675048828, + "kl": 2.318613374605775, + "learning_rate": 9.567727288213004e-07, + "loss": 0.2319, + "num_tokens": 6144688.0, + "reward": 0.74017333984375, + "reward_std": 0.018025681376457214, + "rewards//mean": 0.74017333984375, + "rewards//std": 0.029098762199282646, + "step": 711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1424, + "grad_norm": 3.940495014190674, + "kl": 2.2707323767244816, + "learning_rate": 9.566435660430527e-07, + "loss": 0.2271, + "num_tokens": 6153288.0, + "reward": 0.74481201171875, + "reward_std": 0.02541990950703621, + "rewards//mean": 0.74481201171875, + "rewards//std": 0.04780135676264763, + "step": 712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1426, + "grad_norm": 6.630128383636475, + "kl": 1.3487122803926468, + "learning_rate": 9.565142193288312e-07, + "loss": 0.1349, + "num_tokens": 6161936.0, + "reward": 0.7525634765625, + "reward_std": 0.02380307763814926, + "rewards//mean": 0.7525634765625, + "rewards//std": 0.04971463978290558, + "step": 713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1428, + "grad_norm": 7.120047092437744, + "kl": 1.6414924841374159, + "learning_rate": 9.563846887307368e-07, + "loss": 0.1641, + "num_tokens": 6170696.0, + "reward": 0.76116943359375, + "reward_std": 0.017113400623202324, + "rewards//mean": 0.76116943359375, + "rewards//std": 0.03396749868988991, + "step": 714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.143, + "grad_norm": 6.949481010437012, + "kl": 1.45215680077672, + "learning_rate": 9.562549743009442e-07, + "loss": 0.1452, + "num_tokens": 6179384.0, + "reward": 0.77239990234375, + "reward_std": 0.019084131345152855, + "rewards//mean": 0.77239990234375, + "rewards//std": 0.038995739072561264, + "step": 715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1432, + "grad_norm": 6.028757572174072, + "kl": 2.0199192948639393, + "learning_rate": 9.561250760917025e-07, + "loss": 0.202, + "num_tokens": 6187896.0, + "reward": 0.75274658203125, + "reward_std": 0.017159592360258102, + "rewards//mean": 0.75274658203125, + "rewards//std": 0.038004521280527115, + "step": 716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1434, + "grad_norm": 4.979818820953369, + "kl": 1.553580492734909, + "learning_rate": 9.55994994155335e-07, + "loss": 0.1554, + "num_tokens": 6196536.0, + "reward": 0.7301025390625, + "reward_std": 0.01808983087539673, + "rewards//mean": 0.7301025390625, + "rewards//std": 0.03162534162402153, + "step": 717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1436, + "grad_norm": 5.092225551605225, + "kl": 1.3015301302075386, + "learning_rate": 9.558647285442381e-07, + "loss": 0.1302, + "num_tokens": 6205168.0, + "reward": 0.74774169921875, + "reward_std": 0.01389441266655922, + "rewards//mean": 0.74774169921875, + "rewards//std": 0.029461177065968513, + "step": 718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1438, + "grad_norm": 8.318163871765137, + "kl": 2.202155739068985, + "learning_rate": 9.55734279310883e-07, + "loss": 0.2202, + "num_tokens": 6213784.0, + "reward": 0.7529296875, + "reward_std": 0.017229489982128143, + "rewards//mean": 0.7529296875, + "rewards//std": 0.03742862865328789, + "step": 719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.144, + "grad_norm": 8.821160316467285, + "kl": 2.473429564386606, + "learning_rate": 9.55603646507815e-07, + "loss": 0.2473, + "num_tokens": 6222432.0, + "reward": 0.74346923828125, + "reward_std": 0.018832771107554436, + "rewards//mean": 0.74346923828125, + "rewards//std": 0.03762904927134514, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1442, + "grad_norm": 17.328784942626953, + "kl": 1.1039512548595667, + "learning_rate": 9.554728301876524e-07, + "loss": 0.1104, + "num_tokens": 6231232.0, + "reward": 0.7550048828125, + "reward_std": 0.011336077004671097, + "rewards//mean": 0.7550048828125, + "rewards//std": 0.038585059344768524, + "step": 721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1444, + "grad_norm": 5.136916637420654, + "kl": 1.792027784511447, + "learning_rate": 9.553418304030885e-07, + "loss": 0.1792, + "num_tokens": 6239984.0, + "reward": 0.74462890625, + "reward_std": 0.021673094481229782, + "rewards//mean": 0.74462890625, + "rewards//std": 0.036027293652296066, + "step": 722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1446, + "grad_norm": 4.544358253479004, + "kl": 2.2820528466254473, + "learning_rate": 9.552106472068897e-07, + "loss": 0.2282, + "num_tokens": 6248576.0, + "reward": 0.73602294921875, + "reward_std": 0.02027995139360428, + "rewards//mean": 0.73602294921875, + "rewards//std": 0.03474952653050423, + "step": 723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1448, + "grad_norm": 6.115887641906738, + "kl": 2.1123006008565426, + "learning_rate": 9.550792806518967e-07, + "loss": 0.2112, + "num_tokens": 6257128.0, + "reward": 0.764404296875, + "reward_std": 0.01684088632464409, + "rewards//mean": 0.764404296875, + "rewards//std": 0.0381837859749794, + "step": 724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.145, + "grad_norm": 4.227281093597412, + "kl": 2.1123296599835157, + "learning_rate": 9.549477307910236e-07, + "loss": 0.2112, + "num_tokens": 6265768.0, + "reward": 0.7764892578125, + "reward_std": 0.0175149068236351, + "rewards//mean": 0.7764892578125, + "rewards//std": 0.02934308350086212, + "step": 725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1452, + "grad_norm": 4.551183223724365, + "kl": 1.2583585027605295, + "learning_rate": 9.548159976772592e-07, + "loss": 0.1258, + "num_tokens": 6274448.0, + "reward": 0.7623291015625, + "reward_std": 0.01506746280938387, + "rewards//mean": 0.7623291015625, + "rewards//std": 0.027011506259441376, + "step": 726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1454, + "grad_norm": 6.423621654510498, + "kl": 2.4540082439780235, + "learning_rate": 9.546840813636652e-07, + "loss": 0.2454, + "num_tokens": 6283040.0, + "reward": 0.74432373046875, + "reward_std": 0.01908603496849537, + "rewards//mean": 0.74432373046875, + "rewards//std": 0.036972418427467346, + "step": 727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1456, + "grad_norm": 6.135375499725342, + "kl": 1.6809967551380396, + "learning_rate": 9.545519819033777e-07, + "loss": 0.1681, + "num_tokens": 6291704.0, + "reward": 0.7060546875, + "reward_std": 0.01635526865720749, + "rewards//mean": 0.7060546875, + "rewards//std": 0.04816451296210289, + "step": 728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1458, + "grad_norm": 10.800931930541992, + "kl": 1.3858015835285187, + "learning_rate": 9.544196993496062e-07, + "loss": 0.1386, + "num_tokens": 6300344.0, + "reward": 0.7584228515625, + "reward_std": 0.010101072490215302, + "rewards//mean": 0.7584228515625, + "rewards//std": 0.03870883211493492, + "step": 729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.146, + "grad_norm": 5.990006923675537, + "kl": 1.5613158121705055, + "learning_rate": 9.54287233755634e-07, + "loss": 0.1561, + "num_tokens": 6308912.0, + "reward": 0.757080078125, + "reward_std": 0.02095198445022106, + "rewards//mean": 0.757080078125, + "rewards//std": 0.039756208658218384, + "step": 730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1462, + "grad_norm": 7.086883068084717, + "kl": 1.9290961921215057, + "learning_rate": 9.541545851748185e-07, + "loss": 0.1929, + "num_tokens": 6317680.0, + "reward": 0.73089599609375, + "reward_std": 0.01549561694264412, + "rewards//mean": 0.73089599609375, + "rewards//std": 0.032764315605163574, + "step": 731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1464, + "grad_norm": 4.728109359741211, + "kl": 1.6669486071914434, + "learning_rate": 9.540217536605905e-07, + "loss": 0.1667, + "num_tokens": 6326368.0, + "reward": 0.79449462890625, + "reward_std": 0.013113592751324177, + "rewards//mean": 0.79449462890625, + "rewards//std": 0.023812895640730858, + "step": 732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1466, + "grad_norm": 13.25939655303955, + "kl": 3.0511637795716524, + "learning_rate": 9.538887392664543e-07, + "loss": 0.3051, + "num_tokens": 6335016.0, + "reward": 0.73712158203125, + "reward_std": 0.01813594251871109, + "rewards//mean": 0.73712158203125, + "rewards//std": 0.04806726053357124, + "step": 733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1468, + "grad_norm": 3.8899991512298584, + "kl": 1.5022244974970818, + "learning_rate": 9.537555420459881e-07, + "loss": 0.1502, + "num_tokens": 6343744.0, + "reward": 0.76983642578125, + "reward_std": 0.016706019639968872, + "rewards//mean": 0.76983642578125, + "rewards//std": 0.03192707151174545, + "step": 734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.147, + "grad_norm": 3.834383487701416, + "kl": 1.3857309743762016, + "learning_rate": 9.53622162052844e-07, + "loss": 0.1386, + "num_tokens": 6352456.0, + "reward": 0.7703857421875, + "reward_std": 0.010684870183467865, + "rewards//mean": 0.7703857421875, + "rewards//std": 0.031446777284145355, + "step": 735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1472, + "grad_norm": 5.069825172424316, + "kl": 1.4565946031361818, + "learning_rate": 9.534885993407474e-07, + "loss": 0.1457, + "num_tokens": 6361208.0, + "reward": 0.762939453125, + "reward_std": 0.007309177424758673, + "rewards//mean": 0.762939453125, + "rewards//std": 0.01368957944214344, + "step": 736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1474, + "grad_norm": 5.167150974273682, + "kl": 1.761495502665639, + "learning_rate": 9.53354853963497e-07, + "loss": 0.1761, + "num_tokens": 6369800.0, + "reward": 0.75775146484375, + "reward_std": 0.012911893427371979, + "rewards//mean": 0.75775146484375, + "rewards//std": 0.02882697992026806, + "step": 737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1476, + "grad_norm": 6.316102027893066, + "kl": 2.4946143217384815, + "learning_rate": 9.532209259749658e-07, + "loss": 0.2495, + "num_tokens": 6378480.0, + "reward": 0.739013671875, + "reward_std": 0.01754099503159523, + "rewards//mean": 0.739013671875, + "rewards//std": 0.03786530718207359, + "step": 738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1478, + "grad_norm": 3.7886784076690674, + "kl": 2.09511149674654, + "learning_rate": 9.530868154290996e-07, + "loss": 0.2095, + "num_tokens": 6387056.0, + "reward": 0.7412109375, + "reward_std": 0.02488623559474945, + "rewards//mean": 0.7412109375, + "rewards//std": 0.052757907658815384, + "step": 739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.148, + "grad_norm": 5.090835094451904, + "kl": 1.3681064881384373, + "learning_rate": 9.529525223799184e-07, + "loss": 0.1368, + "num_tokens": 6395720.0, + "reward": 0.77685546875, + "reward_std": 0.010407458990812302, + "rewards//mean": 0.77685546875, + "rewards//std": 0.028641607612371445, + "step": 740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1482, + "grad_norm": 12.468710899353027, + "kl": 1.285261558368802, + "learning_rate": 9.528180468815154e-07, + "loss": 0.1285, + "num_tokens": 6404408.0, + "reward": 0.7642822265625, + "reward_std": 0.013152715750038624, + "rewards//mean": 0.7642822265625, + "rewards//std": 0.024980410933494568, + "step": 741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1484, + "grad_norm": 3.6978514194488525, + "kl": 1.7975781913846731, + "learning_rate": 9.526833889880572e-07, + "loss": 0.1798, + "num_tokens": 6413088.0, + "reward": 0.76416015625, + "reward_std": 0.01844346709549427, + "rewards//mean": 0.76416015625, + "rewards//std": 0.03828277066349983, + "step": 742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1486, + "grad_norm": 4.001753330230713, + "kl": 1.6544487792998552, + "learning_rate": 9.525485487537841e-07, + "loss": 0.1654, + "num_tokens": 6421912.0, + "reward": 0.768798828125, + "reward_std": 0.01586316153407097, + "rewards//mean": 0.768798828125, + "rewards//std": 0.03408144786953926, + "step": 743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1488, + "grad_norm": 3.864835500717163, + "kl": 1.9045432973653078, + "learning_rate": 9.524135262330098e-07, + "loss": 0.1905, + "num_tokens": 6430552.0, + "reward": 0.77484130859375, + "reward_std": 0.02079934999346733, + "rewards//mean": 0.77484130859375, + "rewards//std": 0.0369359627366066, + "step": 744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.149, + "grad_norm": 6.792883396148682, + "kl": 1.7718728762120008, + "learning_rate": 9.522783214801211e-07, + "loss": 0.1772, + "num_tokens": 6439144.0, + "reward": 0.76434326171875, + "reward_std": 0.01758260279893875, + "rewards//mean": 0.76434326171875, + "rewards//std": 0.026512254029512405, + "step": 745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1492, + "grad_norm": 5.843996047973633, + "kl": 1.6257364582270384, + "learning_rate": 9.521429345495786e-07, + "loss": 0.1626, + "num_tokens": 6447728.0, + "reward": 0.7696533203125, + "reward_std": 0.017090419307351112, + "rewards//mean": 0.7696533203125, + "rewards//std": 0.029943620786070824, + "step": 746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1494, + "grad_norm": 5.6099066734313965, + "kl": 1.8088597785681486, + "learning_rate": 9.520073654959162e-07, + "loss": 0.1809, + "num_tokens": 6456312.0, + "reward": 0.7591552734375, + "reward_std": 0.016383163630962372, + "rewards//mean": 0.7591552734375, + "rewards//std": 0.03935418650507927, + "step": 747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1496, + "grad_norm": 3.0421900749206543, + "kl": 2.464435391128063, + "learning_rate": 9.518716143737409e-07, + "loss": 0.2464, + "num_tokens": 6464936.0, + "reward": 0.77044677734375, + "reward_std": 0.019843457266688347, + "rewards//mean": 0.77044677734375, + "rewards//std": 0.029361844062805176, + "step": 748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1498, + "grad_norm": 10.730456352233887, + "kl": 1.909607894718647, + "learning_rate": 9.517356812377335e-07, + "loss": 0.191, + "num_tokens": 6473664.0, + "reward": 0.73126220703125, + "reward_std": 0.01390963513404131, + "rewards//mean": 0.73126220703125, + "rewards//std": 0.042542073875665665, + "step": 749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.15, + "grad_norm": 9.618935585021973, + "kl": 2.2544579710811377, + "learning_rate": 9.515995661426477e-07, + "loss": 0.2254, + "num_tokens": 6482200.0, + "reward": 0.73931884765625, + "reward_std": 0.012467009015381336, + "rewards//mean": 0.73931884765625, + "rewards//std": 0.03363339602947235, + "step": 750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1502, + "grad_norm": 6.5307230949401855, + "kl": 1.706093642860651, + "learning_rate": 9.514632691433106e-07, + "loss": 0.1706, + "num_tokens": 6490712.0, + "reward": 0.74200439453125, + "reward_std": 0.012252597138285637, + "rewards//mean": 0.74200439453125, + "rewards//std": 0.04213160276412964, + "step": 751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1504, + "grad_norm": 6.90416955947876, + "kl": 2.5524584986269474, + "learning_rate": 9.513267902946227e-07, + "loss": 0.2552, + "num_tokens": 6499392.0, + "reward": 0.76019287109375, + "reward_std": 0.01867758482694626, + "rewards//mean": 0.76019287109375, + "rewards//std": 0.03468542918562889, + "step": 752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1506, + "grad_norm": 11.315291404724121, + "kl": 3.1306357700377703, + "learning_rate": 9.511901296515576e-07, + "loss": 0.3131, + "num_tokens": 6508160.0, + "reward": 0.76019287109375, + "reward_std": 0.02278970740735531, + "rewards//mean": 0.76019287109375, + "rewards//std": 0.03783806040883064, + "step": 753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1508, + "grad_norm": 5.300631999969482, + "kl": 1.964156363159418, + "learning_rate": 9.510532872691623e-07, + "loss": 0.1964, + "num_tokens": 6516832.0, + "reward": 0.72296142578125, + "reward_std": 0.016734154894948006, + "rewards//mean": 0.72296142578125, + "rewards//std": 0.035924993455410004, + "step": 754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.151, + "grad_norm": 5.230804443359375, + "kl": 1.7890691291540861, + "learning_rate": 9.509162632025569e-07, + "loss": 0.1789, + "num_tokens": 6525520.0, + "reward": 0.7613525390625, + "reward_std": 0.010999785736203194, + "rewards//mean": 0.7613525390625, + "rewards//std": 0.02713228575885296, + "step": 755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1512, + "grad_norm": 14.658576011657715, + "kl": 2.432515686377883, + "learning_rate": 9.507790575069345e-07, + "loss": 0.2433, + "num_tokens": 6534128.0, + "reward": 0.75335693359375, + "reward_std": 0.016607044264674187, + "rewards//mean": 0.75335693359375, + "rewards//std": 0.04579753056168556, + "step": 756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1514, + "grad_norm": 12.985304832458496, + "kl": 2.872597623616457, + "learning_rate": 9.506416702375617e-07, + "loss": 0.2873, + "num_tokens": 6542736.0, + "reward": 0.73870849609375, + "reward_std": 0.01667785458266735, + "rewards//mean": 0.73870849609375, + "rewards//std": 0.038795698434114456, + "step": 757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1516, + "grad_norm": 8.331042289733887, + "kl": 3.2442229371517897, + "learning_rate": 9.505041014497779e-07, + "loss": 0.3244, + "num_tokens": 6551352.0, + "reward": 0.74969482421875, + "reward_std": 0.020662259310483932, + "rewards//mean": 0.74969482421875, + "rewards//std": 0.04038140922784805, + "step": 758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1518, + "grad_norm": 10.844252586364746, + "kl": 2.4486917965114117, + "learning_rate": 9.503663511989962e-07, + "loss": 0.2449, + "num_tokens": 6560040.0, + "reward": 0.75738525390625, + "reward_std": 0.020630618557333946, + "rewards//mean": 0.75738525390625, + "rewards//std": 0.03961732238531113, + "step": 759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.152, + "grad_norm": 6.808208465576172, + "kl": 2.6468605156987906, + "learning_rate": 9.502284195407018e-07, + "loss": 0.2647, + "num_tokens": 6568704.0, + "reward": 0.73052978515625, + "reward_std": 0.01908203214406967, + "rewards//mean": 0.73052978515625, + "rewards//std": 0.034255944192409515, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1522, + "grad_norm": 8.170414924621582, + "kl": 2.510532608255744, + "learning_rate": 9.500903065304539e-07, + "loss": 0.2511, + "num_tokens": 6577336.0, + "reward": 0.74847412109375, + "reward_std": 0.018339160829782486, + "rewards//mean": 0.74847412109375, + "rewards//std": 0.03979726508259773, + "step": 761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1524, + "grad_norm": 4.826385021209717, + "kl": 1.82676731236279, + "learning_rate": 9.499520122238845e-07, + "loss": 0.1827, + "num_tokens": 6585944.0, + "reward": 0.74761962890625, + "reward_std": 0.012161046266555786, + "rewards//mean": 0.74761962890625, + "rewards//std": 0.03418605402112007, + "step": 762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1526, + "grad_norm": 5.022667407989502, + "kl": 0.9693844858556986, + "learning_rate": 9.498135366766982e-07, + "loss": 0.0969, + "num_tokens": 6594568.0, + "reward": 0.76568603515625, + "reward_std": 0.011609884910285473, + "rewards//mean": 0.76568603515625, + "rewards//std": 0.02998826466500759, + "step": 763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1528, + "grad_norm": 7.236996173858643, + "kl": 1.1193734277039766, + "learning_rate": 9.496748799446732e-07, + "loss": 0.1119, + "num_tokens": 6603064.0, + "reward": 0.736328125, + "reward_std": 0.00927905272692442, + "rewards//mean": 0.736328125, + "rewards//std": 0.02765641175210476, + "step": 764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.153, + "grad_norm": 5.365597248077393, + "kl": 1.578007174655795, + "learning_rate": 9.495360420836602e-07, + "loss": 0.1578, + "num_tokens": 6611760.0, + "reward": 0.75299072265625, + "reward_std": 0.0181528739631176, + "rewards//mean": 0.75299072265625, + "rewards//std": 0.03481176495552063, + "step": 765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1532, + "grad_norm": 4.389156341552734, + "kl": 1.9824351072311401, + "learning_rate": 9.493970231495834e-07, + "loss": 0.1982, + "num_tokens": 6620544.0, + "reward": 0.76898193359375, + "reward_std": 0.019655738025903702, + "rewards//mean": 0.76898193359375, + "rewards//std": 0.044705647975206375, + "step": 766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1534, + "grad_norm": 4.694593906402588, + "kl": 1.9320093467831612, + "learning_rate": 9.492578231984393e-07, + "loss": 0.1932, + "num_tokens": 6629192.0, + "reward": 0.74249267578125, + "reward_std": 0.012822737917304039, + "rewards//mean": 0.74249267578125, + "rewards//std": 0.03166809305548668, + "step": 767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1536, + "grad_norm": 3.990737199783325, + "kl": 1.785394612699747, + "learning_rate": 9.491184422862979e-07, + "loss": 0.1785, + "num_tokens": 6637832.0, + "reward": 0.72174072265625, + "reward_std": 0.013771869242191315, + "rewards//mean": 0.72174072265625, + "rewards//std": 0.036096084862947464, + "step": 768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1538, + "grad_norm": 3.8699402809143066, + "kl": 1.9711164645850658, + "learning_rate": 9.489788804693015e-07, + "loss": 0.1971, + "num_tokens": 6646552.0, + "reward": 0.77679443359375, + "reward_std": 0.024553906172513962, + "rewards//mean": 0.77679443359375, + "rewards//std": 0.045781660825014114, + "step": 769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.154, + "grad_norm": 3.6297686100006104, + "kl": 1.7416410017758608, + "learning_rate": 9.488391378036659e-07, + "loss": 0.1742, + "num_tokens": 6655176.0, + "reward": 0.7724609375, + "reward_std": 0.015902843326330185, + "rewards//mean": 0.7724609375, + "rewards//std": 0.03722097724676132, + "step": 770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1542, + "grad_norm": 11.240531921386719, + "kl": 1.5410529263317585, + "learning_rate": 9.486992143456791e-07, + "loss": 0.1541, + "num_tokens": 6663840.0, + "reward": 0.761962890625, + "reward_std": 0.013263905420899391, + "rewards//mean": 0.761962890625, + "rewards//std": 0.037814099341630936, + "step": 771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1544, + "grad_norm": 10.247224807739258, + "kl": 1.236491760239005, + "learning_rate": 9.485591101517026e-07, + "loss": 0.1236, + "num_tokens": 6672440.0, + "reward": 0.7432861328125, + "reward_std": 0.009625021368265152, + "rewards//mean": 0.7432861328125, + "rewards//std": 0.026166634634137154, + "step": 772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1546, + "grad_norm": 3.6303439140319824, + "kl": 1.0167049001902342, + "learning_rate": 9.4841882527817e-07, + "loss": 0.1017, + "num_tokens": 6680960.0, + "reward": 0.75933837890625, + "reward_std": 0.009588822722434998, + "rewards//mean": 0.75933837890625, + "rewards//std": 0.024974577128887177, + "step": 773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1548, + "grad_norm": 4.442397117614746, + "kl": 1.739076443016529, + "learning_rate": 9.482783597815882e-07, + "loss": 0.1739, + "num_tokens": 6689760.0, + "reward": 0.7567138671875, + "reward_std": 0.02197597734630108, + "rewards//mean": 0.7567138671875, + "rewards//std": 0.04350633546710014, + "step": 774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.155, + "grad_norm": 3.991363525390625, + "kl": 1.2640179004520178, + "learning_rate": 9.481377137185369e-07, + "loss": 0.1264, + "num_tokens": 6698392.0, + "reward": 0.77264404296875, + "reward_std": 0.0130799300968647, + "rewards//mean": 0.77264404296875, + "rewards//std": 0.029726143926382065, + "step": 775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1552, + "grad_norm": 7.799238681793213, + "kl": 1.2722574938088655, + "learning_rate": 9.479968871456679e-07, + "loss": 0.1272, + "num_tokens": 6707040.0, + "reward": 0.75006103515625, + "reward_std": 0.008785752579569817, + "rewards//mean": 0.75006103515625, + "rewards//std": 0.029744980856776237, + "step": 776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1554, + "grad_norm": 7.036464691162109, + "kl": 1.5898052733391523, + "learning_rate": 9.478558801197064e-07, + "loss": 0.159, + "num_tokens": 6715672.0, + "reward": 0.77020263671875, + "reward_std": 0.017948858439922333, + "rewards//mean": 0.77020263671875, + "rewards//std": 0.03062807209789753, + "step": 777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1556, + "grad_norm": 4.282284259796143, + "kl": 1.4198356419801712, + "learning_rate": 9.4771469269745e-07, + "loss": 0.142, + "num_tokens": 6724312.0, + "reward": 0.76318359375, + "reward_std": 0.016161056235432625, + "rewards//mean": 0.76318359375, + "rewards//std": 0.03690403327345848, + "step": 778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1558, + "grad_norm": 6.055977821350098, + "kl": 1.6245936155319214, + "learning_rate": 9.475733249357688e-07, + "loss": 0.1625, + "num_tokens": 6732968.0, + "reward": 0.761962890625, + "reward_std": 0.013400651514530182, + "rewards//mean": 0.761962890625, + "rewards//std": 0.040138185024261475, + "step": 779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.156, + "grad_norm": 7.814728736877441, + "kl": 2.0569280479103327, + "learning_rate": 9.474317768916059e-07, + "loss": 0.2057, + "num_tokens": 6741616.0, + "reward": 0.75927734375, + "reward_std": 0.020186103880405426, + "rewards//mean": 0.75927734375, + "rewards//std": 0.035492222756147385, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1562, + "grad_norm": 5.3690385818481445, + "kl": 0.8401883486658335, + "learning_rate": 9.472900486219768e-07, + "loss": 0.084, + "num_tokens": 6750248.0, + "reward": 0.77740478515625, + "reward_std": 0.019048381596803665, + "rewards//mean": 0.77740478515625, + "rewards//std": 0.03795868903398514, + "step": 781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1564, + "grad_norm": 3.8546175956726074, + "kl": 1.0615817327052355, + "learning_rate": 9.471481401839696e-07, + "loss": 0.1062, + "num_tokens": 6758784.0, + "reward": 0.76470947265625, + "reward_std": 0.01313089206814766, + "rewards//mean": 0.76470947265625, + "rewards//std": 0.03293849527835846, + "step": 782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1566, + "grad_norm": 6.704108715057373, + "kl": 1.195159973576665, + "learning_rate": 9.470060516347449e-07, + "loss": 0.1195, + "num_tokens": 6767448.0, + "reward": 0.7789306640625, + "reward_std": 0.015296168625354767, + "rewards//mean": 0.7789306640625, + "rewards//std": 0.027653949335217476, + "step": 783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1568, + "grad_norm": 7.372798442840576, + "kl": 1.1695575397461653, + "learning_rate": 9.468637830315362e-07, + "loss": 0.117, + "num_tokens": 6776224.0, + "reward": 0.71832275390625, + "reward_std": 0.012001742608845234, + "rewards//mean": 0.71832275390625, + "rewards//std": 0.04022403433918953, + "step": 784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.157, + "grad_norm": 9.356093406677246, + "kl": 1.0185332987457514, + "learning_rate": 9.467213344316491e-07, + "loss": 0.1019, + "num_tokens": 6784832.0, + "reward": 0.7515869140625, + "reward_std": 0.017094669863581657, + "rewards//mean": 0.7515869140625, + "rewards//std": 0.04253944754600525, + "step": 785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1572, + "grad_norm": 5.204826354980469, + "kl": 1.2351787276566029, + "learning_rate": 9.465787058924619e-07, + "loss": 0.1235, + "num_tokens": 6793464.0, + "reward": 0.78326416015625, + "reward_std": 0.01512465812265873, + "rewards//mean": 0.78326416015625, + "rewards//std": 0.032436493784189224, + "step": 786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1574, + "grad_norm": 9.974525451660156, + "kl": 1.3910132851451635, + "learning_rate": 9.464358974714252e-07, + "loss": 0.1391, + "num_tokens": 6801992.0, + "reward": 0.765869140625, + "reward_std": 0.02188878506422043, + "rewards//mean": 0.765869140625, + "rewards//std": 0.03544698655605316, + "step": 787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1576, + "grad_norm": 3.495143175125122, + "kl": 1.580335434526205, + "learning_rate": 9.462929092260628e-07, + "loss": 0.158, + "num_tokens": 6810688.0, + "reward": 0.73687744140625, + "reward_std": 0.012116469442844391, + "rewards//mean": 0.73687744140625, + "rewards//std": 0.03512776643037796, + "step": 788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1578, + "grad_norm": 5.81279182434082, + "kl": 1.55155180208385, + "learning_rate": 9.461497412139696e-07, + "loss": 0.1552, + "num_tokens": 6819336.0, + "reward": 0.75189208984375, + "reward_std": 0.01254141703248024, + "rewards//mean": 0.75189208984375, + "rewards//std": 0.04477670043706894, + "step": 789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.158, + "grad_norm": 5.675829887390137, + "kl": 1.8020419720560312, + "learning_rate": 9.460063934928141e-07, + "loss": 0.1802, + "num_tokens": 6828088.0, + "reward": 0.7413330078125, + "reward_std": 0.014477964490652084, + "rewards//mean": 0.7413330078125, + "rewards//std": 0.03759951889514923, + "step": 790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1582, + "grad_norm": 3.879511594772339, + "kl": 2.192951174452901, + "learning_rate": 9.458628661203366e-07, + "loss": 0.2193, + "num_tokens": 6836808.0, + "reward": 0.7481689453125, + "reward_std": 0.016593217849731445, + "rewards//mean": 0.7481689453125, + "rewards//std": 0.02330244705080986, + "step": 791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1584, + "grad_norm": 4.301689147949219, + "kl": 1.8302797842770815, + "learning_rate": 9.4571915915435e-07, + "loss": 0.183, + "num_tokens": 6845400.0, + "reward": 0.74737548828125, + "reward_std": 0.018161125481128693, + "rewards//mean": 0.74737548828125, + "rewards//std": 0.031389620155096054, + "step": 792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1586, + "grad_norm": 11.259052276611328, + "kl": 1.885145427659154, + "learning_rate": 9.455752726527392e-07, + "loss": 0.1885, + "num_tokens": 6854096.0, + "reward": 0.720703125, + "reward_std": 0.017080646008253098, + "rewards//mean": 0.720703125, + "rewards//std": 0.03896316885948181, + "step": 793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1588, + "grad_norm": 11.50151538848877, + "kl": 1.2966818679124117, + "learning_rate": 9.454312066734622e-07, + "loss": 0.1297, + "num_tokens": 6862656.0, + "reward": 0.7252197265625, + "reward_std": 0.012704441323876381, + "rewards//mean": 0.7252197265625, + "rewards//std": 0.024351980537176132, + "step": 794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.159, + "grad_norm": 6.524673938751221, + "kl": 1.916914639994502, + "learning_rate": 9.452869612745483e-07, + "loss": 0.1917, + "num_tokens": 6871248.0, + "reward": 0.75933837890625, + "reward_std": 0.01959666982293129, + "rewards//mean": 0.75933837890625, + "rewards//std": 0.03807416185736656, + "step": 795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1592, + "grad_norm": 5.548999309539795, + "kl": 1.4934558384120464, + "learning_rate": 9.451425365140994e-07, + "loss": 0.1493, + "num_tokens": 6879968.0, + "reward": 0.75018310546875, + "reward_std": 0.013776395469903946, + "rewards//mean": 0.75018310546875, + "rewards//std": 0.03590560704469681, + "step": 796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1594, + "grad_norm": 7.6942138671875, + "kl": 1.9479884691536427, + "learning_rate": 9.449979324502903e-07, + "loss": 0.1948, + "num_tokens": 6888536.0, + "reward": 0.74859619140625, + "reward_std": 0.023634785786271095, + "rewards//mean": 0.74859619140625, + "rewards//std": 0.03557311370968819, + "step": 797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1596, + "grad_norm": 6.731383323669434, + "kl": 2.307126719504595, + "learning_rate": 9.448531491413672e-07, + "loss": 0.2307, + "num_tokens": 6897080.0, + "reward": 0.7607421875, + "reward_std": 0.013529423624277115, + "rewards//mean": 0.7607421875, + "rewards//std": 0.03001658245921135, + "step": 798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1598, + "grad_norm": 10.119559288024902, + "kl": 2.3343022875487804, + "learning_rate": 9.447081866456487e-07, + "loss": 0.2334, + "num_tokens": 6905712.0, + "reward": 0.75238037109375, + "reward_std": 0.013632997870445251, + "rewards//mean": 0.75238037109375, + "rewards//std": 0.03842874616384506, + "step": 799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.16, + "grad_norm": 7.11630392074585, + "kl": 2.8813906107097864, + "learning_rate": 9.445630450215259e-07, + "loss": 0.2881, + "num_tokens": 6914304.0, + "reward": 0.75439453125, + "reward_std": 0.019328434020280838, + "rewards//mean": 0.75439453125, + "rewards//std": 0.028877412900328636, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1602, + "grad_norm": 6.320123672485352, + "kl": 2.823726534843445, + "learning_rate": 9.444177243274617e-07, + "loss": 0.2824, + "num_tokens": 6922960.0, + "reward": 0.7745361328125, + "reward_std": 0.021904705092310905, + "rewards//mean": 0.7745361328125, + "rewards//std": 0.03933725878596306, + "step": 801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1604, + "grad_norm": 8.631261825561523, + "kl": 2.897394433617592, + "learning_rate": 9.442722246219913e-07, + "loss": 0.2897, + "num_tokens": 6931632.0, + "reward": 0.72723388671875, + "reward_std": 0.019145509228110313, + "rewards//mean": 0.72723388671875, + "rewards//std": 0.043193940073251724, + "step": 802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1606, + "grad_norm": 12.911233901977539, + "kl": 2.2146845385432243, + "learning_rate": 9.441265459637219e-07, + "loss": 0.2215, + "num_tokens": 6940272.0, + "reward": 0.7568359375, + "reward_std": 0.019703920930624008, + "rewards//mean": 0.7568359375, + "rewards//std": 0.034993976354599, + "step": 803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1608, + "grad_norm": 22.874841690063477, + "kl": 2.5433691050857306, + "learning_rate": 9.43980688411333e-07, + "loss": 0.2543, + "num_tokens": 6948856.0, + "reward": 0.72515869140625, + "reward_std": 0.02106834389269352, + "rewards//mean": 0.72515869140625, + "rewards//std": 0.04134201630949974, + "step": 804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.161, + "grad_norm": 10.111599922180176, + "kl": 2.4337373562157154, + "learning_rate": 9.438346520235758e-07, + "loss": 0.2434, + "num_tokens": 6957592.0, + "reward": 0.7315673828125, + "reward_std": 0.020331665873527527, + "rewards//mean": 0.7315673828125, + "rewards//std": 0.043103642761707306, + "step": 805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1612, + "grad_norm": 3.873319625854492, + "kl": 2.255781589075923, + "learning_rate": 9.436884368592739e-07, + "loss": 0.2256, + "num_tokens": 6966240.0, + "reward": 0.75885009765625, + "reward_std": 0.01403405237942934, + "rewards//mean": 0.75885009765625, + "rewards//std": 0.03315470740199089, + "step": 806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1614, + "grad_norm": 7.305696964263916, + "kl": 2.495479291304946, + "learning_rate": 9.435420429773227e-07, + "loss": 0.2495, + "num_tokens": 6974904.0, + "reward": 0.71661376953125, + "reward_std": 0.025170881301164627, + "rewards//mean": 0.71661376953125, + "rewards//std": 0.05750236287713051, + "step": 807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1616, + "grad_norm": 4.483618259429932, + "kl": 1.3268167339265347, + "learning_rate": 9.433954704366896e-07, + "loss": 0.1327, + "num_tokens": 6983504.0, + "reward": 0.77825927734375, + "reward_std": 0.015569431707262993, + "rewards//mean": 0.77825927734375, + "rewards//std": 0.033930934965610504, + "step": 808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1618, + "grad_norm": 3.663708209991455, + "kl": 1.371718443930149, + "learning_rate": 9.43248719296414e-07, + "loss": 0.1372, + "num_tokens": 6992216.0, + "reward": 0.75067138671875, + "reward_std": 0.014409595169126987, + "rewards//mean": 0.75067138671875, + "rewards//std": 0.03732205927371979, + "step": 809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.162, + "grad_norm": 5.115084648132324, + "kl": 1.7715710327029228, + "learning_rate": 9.431017896156073e-07, + "loss": 0.1772, + "num_tokens": 7000816.0, + "reward": 0.76171875, + "reward_std": 0.02263561636209488, + "rewards//mean": 0.76171875, + "rewards//std": 0.05361855775117874, + "step": 810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1622, + "grad_norm": 9.0412015914917, + "kl": 1.2946567293256521, + "learning_rate": 9.429546814534528e-07, + "loss": 0.1295, + "num_tokens": 7009480.0, + "reward": 0.7667236328125, + "reward_std": 0.01590133085846901, + "rewards//mean": 0.7667236328125, + "rewards//std": 0.0302694384008646, + "step": 811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1624, + "grad_norm": 7.579761505126953, + "kl": 1.7319267839193344, + "learning_rate": 9.428073948692054e-07, + "loss": 0.1732, + "num_tokens": 7018136.0, + "reward": 0.74456787109375, + "reward_std": 0.010734092444181442, + "rewards//mean": 0.74456787109375, + "rewards//std": 0.03490122780203819, + "step": 812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1626, + "grad_norm": 3.380894422531128, + "kl": 1.1995777301490307, + "learning_rate": 9.426599299221924e-07, + "loss": 0.12, + "num_tokens": 7026824.0, + "reward": 0.75201416015625, + "reward_std": 0.014339606277644634, + "rewards//mean": 0.75201416015625, + "rewards//std": 0.03800651431083679, + "step": 813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1628, + "grad_norm": 6.378350257873535, + "kl": 2.075379339978099, + "learning_rate": 9.425122866718127e-07, + "loss": 0.2075, + "num_tokens": 7035488.0, + "reward": 0.75030517578125, + "reward_std": 0.019947510212659836, + "rewards//mean": 0.75030517578125, + "rewards//std": 0.038846779614686966, + "step": 814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.163, + "grad_norm": 3.568049907684326, + "kl": 1.3485429864376783, + "learning_rate": 9.423644651775368e-07, + "loss": 0.1349, + "num_tokens": 7044072.0, + "reward": 0.75067138671875, + "reward_std": 0.01583707332611084, + "rewards//mean": 0.75067138671875, + "rewards//std": 0.027801012620329857, + "step": 815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1632, + "grad_norm": 5.05710506439209, + "kl": 1.1457455083727837, + "learning_rate": 9.422164654989071e-07, + "loss": 0.1146, + "num_tokens": 7052744.0, + "reward": 0.7564697265625, + "reward_std": 0.012639300897717476, + "rewards//mean": 0.7564697265625, + "rewards//std": 0.030058663338422775, + "step": 816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1634, + "grad_norm": 19.65692901611328, + "kl": 1.0076731331646442, + "learning_rate": 9.420682876955381e-07, + "loss": 0.1008, + "num_tokens": 7061384.0, + "reward": 0.73345947265625, + "reward_std": 0.014296118170022964, + "rewards//mean": 0.73345947265625, + "rewards//std": 0.03193797543644905, + "step": 817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1636, + "grad_norm": 3.6845571994781494, + "kl": 1.843559268862009, + "learning_rate": 9.419199318271156e-07, + "loss": 0.1844, + "num_tokens": 7070008.0, + "reward": 0.73309326171875, + "reward_std": 0.010316584259271622, + "rewards//mean": 0.73309326171875, + "rewards//std": 0.040529586374759674, + "step": 818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1638, + "grad_norm": 9.300296783447266, + "kl": 1.147592481225729, + "learning_rate": 9.417713979533974e-07, + "loss": 0.1148, + "num_tokens": 7078680.0, + "reward": 0.77520751953125, + "reward_std": 0.015799185261130333, + "rewards//mean": 0.77520751953125, + "rewards//std": 0.028133351355791092, + "step": 819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.164, + "grad_norm": 3.4130032062530518, + "kl": 1.270079467445612, + "learning_rate": 9.41622686134213e-07, + "loss": 0.127, + "num_tokens": 7087336.0, + "reward": 0.7349853515625, + "reward_std": 0.011345919221639633, + "rewards//mean": 0.7349853515625, + "rewards//std": 0.03531242161989212, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1642, + "grad_norm": 7.637694835662842, + "kl": 1.5577540304511786, + "learning_rate": 9.414737964294634e-07, + "loss": 0.1558, + "num_tokens": 7096016.0, + "reward": 0.75079345703125, + "reward_std": 0.018049750477075577, + "rewards//mean": 0.75079345703125, + "rewards//std": 0.03023509867489338, + "step": 821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1644, + "grad_norm": 4.424720287322998, + "kl": 1.630806166678667, + "learning_rate": 9.413247288991215e-07, + "loss": 0.1631, + "num_tokens": 7104736.0, + "reward": 0.71746826171875, + "reward_std": 0.009988827630877495, + "rewards//mean": 0.71746826171875, + "rewards//std": 0.03421039879322052, + "step": 822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1646, + "grad_norm": 5.737839698791504, + "kl": 1.7182586211711168, + "learning_rate": 9.411754836032314e-07, + "loss": 0.1718, + "num_tokens": 7113368.0, + "reward": 0.75677490234375, + "reward_std": 0.016060881316661835, + "rewards//mean": 0.75677490234375, + "rewards//std": 0.03617316484451294, + "step": 823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1648, + "grad_norm": 3.126258373260498, + "kl": 1.0620542783290148, + "learning_rate": 9.410260606019094e-07, + "loss": 0.1062, + "num_tokens": 7122128.0, + "reward": 0.78106689453125, + "reward_std": 0.00825115293264389, + "rewards//mean": 0.78106689453125, + "rewards//std": 0.02952379733324051, + "step": 824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.165, + "grad_norm": 4.497267723083496, + "kl": 1.0894294548779726, + "learning_rate": 9.408764599553428e-07, + "loss": 0.1089, + "num_tokens": 7130776.0, + "reward": 0.7479248046875, + "reward_std": 0.009177840314805508, + "rewards//mean": 0.7479248046875, + "rewards//std": 0.02511337399482727, + "step": 825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1652, + "grad_norm": 5.435197830200195, + "kl": 1.483167264610529, + "learning_rate": 9.40726681723791e-07, + "loss": 0.1483, + "num_tokens": 7139528.0, + "reward": 0.73291015625, + "reward_std": 0.0182212982326746, + "rewards//mean": 0.73291015625, + "rewards//std": 0.03925732895731926, + "step": 826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1654, + "grad_norm": 14.3667573928833, + "kl": 1.5110141914337873, + "learning_rate": 9.405767259675844e-07, + "loss": 0.1511, + "num_tokens": 7148128.0, + "reward": 0.721435546875, + "reward_std": 0.014402812346816063, + "rewards//mean": 0.721435546875, + "rewards//std": 0.05114896222949028, + "step": 827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1656, + "grad_norm": 9.0249605178833, + "kl": 1.7240530531853437, + "learning_rate": 9.404265927471253e-07, + "loss": 0.1724, + "num_tokens": 7156728.0, + "reward": 0.7257080078125, + "reward_std": 0.011079209856688976, + "rewards//mean": 0.7257080078125, + "rewards//std": 0.037683166563510895, + "step": 828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1658, + "grad_norm": 2.765624523162842, + "kl": 1.5330549646168947, + "learning_rate": 9.402762821228874e-07, + "loss": 0.1533, + "num_tokens": 7165408.0, + "reward": 0.7630615234375, + "reward_std": 0.017075177282094955, + "rewards//mean": 0.7630615234375, + "rewards//std": 0.033683598041534424, + "step": 829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.166, + "grad_norm": 4.675115585327148, + "kl": 1.9389816485345364, + "learning_rate": 9.401257941554156e-07, + "loss": 0.1939, + "num_tokens": 7174016.0, + "reward": 0.757568359375, + "reward_std": 0.01400386355817318, + "rewards//mean": 0.757568359375, + "rewards//std": 0.043995749205350876, + "step": 830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1662, + "grad_norm": 5.841732978820801, + "kl": 2.3723955769091845, + "learning_rate": 9.399751289053266e-07, + "loss": 0.2372, + "num_tokens": 7182656.0, + "reward": 0.74578857421875, + "reward_std": 0.020208366215229034, + "rewards//mean": 0.74578857421875, + "rewards//std": 0.03778481483459473, + "step": 831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1664, + "grad_norm": 4.87501859664917, + "kl": 1.6249112337827682, + "learning_rate": 9.398242864333083e-07, + "loss": 0.1625, + "num_tokens": 7191360.0, + "reward": 0.7568359375, + "reward_std": 0.008860049769282341, + "rewards//mean": 0.7568359375, + "rewards//std": 0.037311967462301254, + "step": 832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1666, + "grad_norm": 5.310845851898193, + "kl": 2.1611604560166597, + "learning_rate": 9.396732668001199e-07, + "loss": 0.2161, + "num_tokens": 7200080.0, + "reward": 0.7513427734375, + "reward_std": 0.016101088374853134, + "rewards//mean": 0.7513427734375, + "rewards//std": 0.026358000934123993, + "step": 833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1668, + "grad_norm": 2.9059054851531982, + "kl": 1.839319683611393, + "learning_rate": 9.395220700665922e-07, + "loss": 0.1839, + "num_tokens": 7208816.0, + "reward": 0.76507568359375, + "reward_std": 0.019753575325012207, + "rewards//mean": 0.76507568359375, + "rewards//std": 0.03695235028862953, + "step": 834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.167, + "grad_norm": 5.503809928894043, + "kl": 2.153663218021393, + "learning_rate": 9.393706962936274e-07, + "loss": 0.2154, + "num_tokens": 7217488.0, + "reward": 0.75311279296875, + "reward_std": 0.017211418598890305, + "rewards//mean": 0.75311279296875, + "rewards//std": 0.03834396228194237, + "step": 835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1672, + "grad_norm": 4.380270957946777, + "kl": 2.353607654571533, + "learning_rate": 9.392191455421987e-07, + "loss": 0.2354, + "num_tokens": 7226024.0, + "reward": 0.7587890625, + "reward_std": 0.017819222062826157, + "rewards//mean": 0.7587890625, + "rewards//std": 0.03496628254652023, + "step": 836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1674, + "grad_norm": 5.176096439361572, + "kl": 2.640985831618309, + "learning_rate": 9.390674178733507e-07, + "loss": 0.2641, + "num_tokens": 7234672.0, + "reward": 0.7403564453125, + "reward_std": 0.022800642997026443, + "rewards//mean": 0.7403564453125, + "rewards//std": 0.04057685285806656, + "step": 837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1676, + "grad_norm": 7.680821895599365, + "kl": 2.7808349207043648, + "learning_rate": 9.389155133481992e-07, + "loss": 0.2781, + "num_tokens": 7243248.0, + "reward": 0.717041015625, + "reward_std": 0.018634147942066193, + "rewards//mean": 0.717041015625, + "rewards//std": 0.03916081041097641, + "step": 838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1678, + "grad_norm": 10.537120819091797, + "kl": 2.879847614094615, + "learning_rate": 9.387634320279314e-07, + "loss": 0.288, + "num_tokens": 7252024.0, + "reward": 0.73858642578125, + "reward_std": 0.01671629585325718, + "rewards//mean": 0.73858642578125, + "rewards//std": 0.033834878355264664, + "step": 839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.168, + "grad_norm": 6.859736919403076, + "kl": 2.1426927223801613, + "learning_rate": 9.386111739738056e-07, + "loss": 0.2143, + "num_tokens": 7260632.0, + "reward": 0.75164794921875, + "reward_std": 0.011810492724180222, + "rewards//mean": 0.75164794921875, + "rewards//std": 0.034406304359436035, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1682, + "grad_norm": 8.232377052307129, + "kl": 2.640538850799203, + "learning_rate": 9.384587392471514e-07, + "loss": 0.2641, + "num_tokens": 7269232.0, + "reward": 0.75140380859375, + "reward_std": 0.023731769993901253, + "rewards//mean": 0.75140380859375, + "rewards//std": 0.03931199759244919, + "step": 841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1684, + "grad_norm": 6.554054260253906, + "kl": 1.5579719077795744, + "learning_rate": 9.383061279093696e-07, + "loss": 0.1558, + "num_tokens": 7277888.0, + "reward": 0.74188232421875, + "reward_std": 0.014754341915249825, + "rewards//mean": 0.74188232421875, + "rewards//std": 0.023850372061133385, + "step": 842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1686, + "grad_norm": 6.298922061920166, + "kl": 1.7747175488620996, + "learning_rate": 9.381533400219317e-07, + "loss": 0.1775, + "num_tokens": 7286528.0, + "reward": 0.76763916015625, + "reward_std": 0.01292702741920948, + "rewards//mean": 0.76763916015625, + "rewards//std": 0.03302844986319542, + "step": 843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1688, + "grad_norm": 4.180309772491455, + "kl": 1.7337059956043959, + "learning_rate": 9.38000375646381e-07, + "loss": 0.1734, + "num_tokens": 7295168.0, + "reward": 0.753662109375, + "reward_std": 0.01970200054347515, + "rewards//mean": 0.753662109375, + "rewards//std": 0.037724319845438004, + "step": 844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.169, + "grad_norm": 3.2928497791290283, + "kl": 2.2027333453297615, + "learning_rate": 9.378472348443314e-07, + "loss": 0.2203, + "num_tokens": 7303800.0, + "reward": 0.7252197265625, + "reward_std": 0.0142702367156744, + "rewards//mean": 0.7252197265625, + "rewards//std": 0.024785738438367844, + "step": 845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1692, + "grad_norm": 4.348998069763184, + "kl": 1.6784103065729141, + "learning_rate": 9.376939176774677e-07, + "loss": 0.1678, + "num_tokens": 7312504.0, + "reward": 0.77056884765625, + "reward_std": 0.01693711057305336, + "rewards//mean": 0.77056884765625, + "rewards//std": 0.039255738258361816, + "step": 846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1694, + "grad_norm": 4.055181503295898, + "kl": 2.1964838411659002, + "learning_rate": 9.375404242075466e-07, + "loss": 0.2196, + "num_tokens": 7321176.0, + "reward": 0.748291015625, + "reward_std": 0.018741052597761154, + "rewards//mean": 0.748291015625, + "rewards//std": 0.036384351551532745, + "step": 847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1696, + "grad_norm": 4.051187515258789, + "kl": 1.29633485712111, + "learning_rate": 9.373867544963948e-07, + "loss": 0.1296, + "num_tokens": 7329792.0, + "reward": 0.7618408203125, + "reward_std": 0.013074061833322048, + "rewards//mean": 0.7618408203125, + "rewards//std": 0.028279660269618034, + "step": 848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1698, + "grad_norm": 7.050297737121582, + "kl": 0.9690040778368711, + "learning_rate": 9.372329086059107e-07, + "loss": 0.0969, + "num_tokens": 7338440.0, + "reward": 0.77362060546875, + "reward_std": 0.009072763845324516, + "rewards//mean": 0.77362060546875, + "rewards//std": 0.021947842091321945, + "step": 849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.17, + "grad_norm": 3.919147491455078, + "kl": 1.2022516168653965, + "learning_rate": 9.370788865980632e-07, + "loss": 0.1202, + "num_tokens": 7347016.0, + "reward": 0.75799560546875, + "reward_std": 0.013911853544414043, + "rewards//mean": 0.75799560546875, + "rewards//std": 0.027307672426104546, + "step": 850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1702, + "grad_norm": 4.519155979156494, + "kl": 1.4689824804663658, + "learning_rate": 9.369246885348925e-07, + "loss": 0.1469, + "num_tokens": 7355664.0, + "reward": 0.700439453125, + "reward_std": 0.00852000992745161, + "rewards//mean": 0.700439453125, + "rewards//std": 0.035481132566928864, + "step": 851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1704, + "grad_norm": 6.284739971160889, + "kl": 2.6096420623362064, + "learning_rate": 9.367703144785095e-07, + "loss": 0.261, + "num_tokens": 7364392.0, + "reward": 0.75140380859375, + "reward_std": 0.018157094717025757, + "rewards//mean": 0.75140380859375, + "rewards//std": 0.04684588685631752, + "step": 852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1706, + "grad_norm": 2.89241886138916, + "kl": 1.513953823596239, + "learning_rate": 9.366157644910959e-07, + "loss": 0.1514, + "num_tokens": 7373032.0, + "reward": 0.741943359375, + "reward_std": 0.012907277792692184, + "rewards//mean": 0.741943359375, + "rewards//std": 0.035535700619220734, + "step": 853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1708, + "grad_norm": 6.478793621063232, + "kl": 1.2654035575687885, + "learning_rate": 9.364610386349047e-07, + "loss": 0.1265, + "num_tokens": 7381688.0, + "reward": 0.75518798828125, + "reward_std": 0.01790492609143257, + "rewards//mean": 0.75518798828125, + "rewards//std": 0.028203211724758148, + "step": 854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.171, + "grad_norm": 4.850090980529785, + "kl": 1.4152612686157227, + "learning_rate": 9.363061369722594e-07, + "loss": 0.1415, + "num_tokens": 7390344.0, + "reward": 0.76995849609375, + "reward_std": 0.014351680874824524, + "rewards//mean": 0.76995849609375, + "rewards//std": 0.027936264872550964, + "step": 855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1712, + "grad_norm": 5.298336982727051, + "kl": 1.8484514653682709, + "learning_rate": 9.361510595655544e-07, + "loss": 0.1848, + "num_tokens": 7398960.0, + "reward": 0.7615966796875, + "reward_std": 0.017985302954912186, + "rewards//mean": 0.7615966796875, + "rewards//std": 0.032068345695734024, + "step": 856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1714, + "grad_norm": 3.7696399688720703, + "kl": 1.8542726691812277, + "learning_rate": 9.359958064772546e-07, + "loss": 0.1854, + "num_tokens": 7407632.0, + "reward": 0.7333984375, + "reward_std": 0.015964325517416, + "rewards//mean": 0.7333984375, + "rewards//std": 0.03637852892279625, + "step": 857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1716, + "grad_norm": 4.572277069091797, + "kl": 2.77557560056448, + "learning_rate": 9.35840377769896e-07, + "loss": 0.2776, + "num_tokens": 7416224.0, + "reward": 0.7694091796875, + "reward_std": 0.02960910275578499, + "rewards//mean": 0.7694091796875, + "rewards//std": 0.03893188014626503, + "step": 858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1718, + "grad_norm": 4.572153091430664, + "kl": 2.2407423984259367, + "learning_rate": 9.356847735060856e-07, + "loss": 0.2241, + "num_tokens": 7424872.0, + "reward": 0.7784423828125, + "reward_std": 0.023178264498710632, + "rewards//mean": 0.7784423828125, + "rewards//std": 0.034340519458055496, + "step": 859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.172, + "grad_norm": 4.627325534820557, + "kl": 1.7834851872175932, + "learning_rate": 9.355289937485004e-07, + "loss": 0.1783, + "num_tokens": 7433544.0, + "reward": 0.765625, + "reward_std": 0.01757705584168434, + "rewards//mean": 0.765625, + "rewards//std": 0.03353790193796158, + "step": 860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1722, + "grad_norm": 7.0663580894470215, + "kl": 2.7586093079298735, + "learning_rate": 9.353730385598886e-07, + "loss": 0.2759, + "num_tokens": 7442144.0, + "reward": 0.756103515625, + "reward_std": 0.023987147957086563, + "rewards//mean": 0.756103515625, + "rewards//std": 0.039228782057762146, + "step": 861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1724, + "grad_norm": 7.226685523986816, + "kl": 2.695838078856468, + "learning_rate": 9.35216908003069e-07, + "loss": 0.2696, + "num_tokens": 7450776.0, + "reward": 0.74609375, + "reward_std": 0.020054344087839127, + "rewards//mean": 0.74609375, + "rewards//std": 0.041805945336818695, + "step": 862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1726, + "grad_norm": 14.021259307861328, + "kl": 2.264453437179327, + "learning_rate": 9.350606021409308e-07, + "loss": 0.2264, + "num_tokens": 7459440.0, + "reward": 0.77838134765625, + "reward_std": 0.022445324808359146, + "rewards//mean": 0.77838134765625, + "rewards//std": 0.029183952137827873, + "step": 863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1728, + "grad_norm": 5.810389518737793, + "kl": 2.81511683575809, + "learning_rate": 9.349041210364341e-07, + "loss": 0.2815, + "num_tokens": 7468160.0, + "reward": 0.76959228515625, + "reward_std": 0.01997082307934761, + "rewards//mean": 0.76959228515625, + "rewards//std": 0.032885145395994186, + "step": 864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.173, + "grad_norm": 4.912047863006592, + "kl": 1.9661368392407894, + "learning_rate": 9.347474647526095e-07, + "loss": 0.1966, + "num_tokens": 7476752.0, + "reward": 0.7652587890625, + "reward_std": 0.011315623298287392, + "rewards//mean": 0.7652587890625, + "rewards//std": 0.019768141210079193, + "step": 865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1732, + "grad_norm": 6.937151908874512, + "kl": 2.165360538288951, + "learning_rate": 9.34590633352558e-07, + "loss": 0.2165, + "num_tokens": 7485448.0, + "reward": 0.76153564453125, + "reward_std": 0.02091323770582676, + "rewards//mean": 0.76153564453125, + "rewards//std": 0.03633475676178932, + "step": 866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1734, + "grad_norm": 46.60414505004883, + "kl": 4.745315488427877, + "learning_rate": 9.344336268994515e-07, + "loss": 0.4745, + "num_tokens": 7494104.0, + "reward": 0.71124267578125, + "reward_std": 0.024051643908023834, + "rewards//mean": 0.71124267578125, + "rewards//std": 0.038256216794252396, + "step": 867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1736, + "grad_norm": 4.389271259307861, + "kl": 1.9652090054005384, + "learning_rate": 9.342764454565319e-07, + "loss": 0.1965, + "num_tokens": 7502752.0, + "reward": 0.75091552734375, + "reward_std": 0.016398733481764793, + "rewards//mean": 0.75091552734375, + "rewards//std": 0.03186964988708496, + "step": 868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1738, + "grad_norm": 3.826873779296875, + "kl": 1.7477713953703642, + "learning_rate": 9.341190890871121e-07, + "loss": 0.1748, + "num_tokens": 7511400.0, + "reward": 0.71832275390625, + "reward_std": 0.01658746972680092, + "rewards//mean": 0.71832275390625, + "rewards//std": 0.03841298818588257, + "step": 869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.174, + "grad_norm": 15.073668479919434, + "kl": 2.9960688669234514, + "learning_rate": 9.339615578545752e-07, + "loss": 0.2996, + "num_tokens": 7520040.0, + "reward": 0.7525634765625, + "reward_std": 0.020237531512975693, + "rewards//mean": 0.7525634765625, + "rewards//std": 0.03274288401007652, + "step": 870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1742, + "grad_norm": 7.638735771179199, + "kl": 2.2340067364275455, + "learning_rate": 9.338038518223745e-07, + "loss": 0.2234, + "num_tokens": 7528672.0, + "reward": 0.77197265625, + "reward_std": 0.025480207055807114, + "rewards//mean": 0.77197265625, + "rewards//std": 0.03820677474141121, + "step": 871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1744, + "grad_norm": 7.6421051025390625, + "kl": 1.5636887550354004, + "learning_rate": 9.336459710540343e-07, + "loss": 0.1564, + "num_tokens": 7537480.0, + "reward": 0.75494384765625, + "reward_std": 0.01364810299128294, + "rewards//mean": 0.75494384765625, + "rewards//std": 0.031337011605501175, + "step": 872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1746, + "grad_norm": 5.859604835510254, + "kl": 1.966890512034297, + "learning_rate": 9.334879156131488e-07, + "loss": 0.1967, + "num_tokens": 7546136.0, + "reward": 0.77093505859375, + "reward_std": 0.02118687331676483, + "rewards//mean": 0.77093505859375, + "rewards//std": 0.030870771035552025, + "step": 873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1748, + "grad_norm": 19.557403564453125, + "kl": 2.767997670918703, + "learning_rate": 9.333296855633827e-07, + "loss": 0.2768, + "num_tokens": 7554760.0, + "reward": 0.75396728515625, + "reward_std": 0.018575340509414673, + "rewards//mean": 0.75396728515625, + "rewards//std": 0.03726319968700409, + "step": 874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.175, + "grad_norm": 7.539550304412842, + "kl": 2.011590525507927, + "learning_rate": 9.331712809684711e-07, + "loss": 0.2012, + "num_tokens": 7563360.0, + "reward": 0.77105712890625, + "reward_std": 0.018703941255807877, + "rewards//mean": 0.77105712890625, + "rewards//std": 0.03832145035266876, + "step": 875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1752, + "grad_norm": 6.650249481201172, + "kl": 1.3714905809611082, + "learning_rate": 9.330127018922193e-07, + "loss": 0.1371, + "num_tokens": 7572032.0, + "reward": 0.75164794921875, + "reward_std": 0.0170736201107502, + "rewards//mean": 0.75164794921875, + "rewards//std": 0.03142625093460083, + "step": 876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1754, + "grad_norm": 6.059285640716553, + "kl": 1.7188973929733038, + "learning_rate": 9.32853948398503e-07, + "loss": 0.1719, + "num_tokens": 7580640.0, + "reward": 0.73638916015625, + "reward_std": 0.018722107633948326, + "rewards//mean": 0.73638916015625, + "rewards//std": 0.03858206793665886, + "step": 877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1756, + "grad_norm": 7.04297399520874, + "kl": 1.3471319321542978, + "learning_rate": 9.32695020551268e-07, + "loss": 0.1347, + "num_tokens": 7589144.0, + "reward": 0.7181396484375, + "reward_std": 0.006738506257534027, + "rewards//mean": 0.7181396484375, + "rewards//std": 0.0360657200217247, + "step": 878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1758, + "grad_norm": 14.926488876342773, + "kl": 1.8079385627061129, + "learning_rate": 9.325359184145305e-07, + "loss": 0.1808, + "num_tokens": 7597784.0, + "reward": 0.73773193359375, + "reward_std": 0.009101016446948051, + "rewards//mean": 0.73773193359375, + "rewards//std": 0.02999129332602024, + "step": 879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.176, + "grad_norm": 6.365018844604492, + "kl": 1.6669358722865582, + "learning_rate": 9.323766420523767e-07, + "loss": 0.1667, + "num_tokens": 7606448.0, + "reward": 0.75244140625, + "reward_std": 0.013080522418022156, + "rewards//mean": 0.75244140625, + "rewards//std": 0.02585030160844326, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1762, + "grad_norm": 5.74120569229126, + "kl": 1.0877815838903189, + "learning_rate": 9.322171915289633e-07, + "loss": 0.1088, + "num_tokens": 7615096.0, + "reward": 0.74676513671875, + "reward_std": 0.011526472866535187, + "rewards//mean": 0.74676513671875, + "rewards//std": 0.030588505789637566, + "step": 881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1764, + "grad_norm": 4.031838893890381, + "kl": 2.4308047238737345, + "learning_rate": 9.320575669085169e-07, + "loss": 0.2431, + "num_tokens": 7623656.0, + "reward": 0.7398681640625, + "reward_std": 0.021138392388820648, + "rewards//mean": 0.7398681640625, + "rewards//std": 0.03511640056967735, + "step": 882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1766, + "grad_norm": 5.192139148712158, + "kl": 1.3053984548896551, + "learning_rate": 9.31897768255334e-07, + "loss": 0.1305, + "num_tokens": 7632288.0, + "reward": 0.75262451171875, + "reward_std": 0.014959865249693394, + "rewards//mean": 0.75262451171875, + "rewards//std": 0.03647862374782562, + "step": 883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1768, + "grad_norm": 6.582054615020752, + "kl": 1.6043333057314157, + "learning_rate": 9.317377956337818e-07, + "loss": 0.1604, + "num_tokens": 7640848.0, + "reward": 0.72216796875, + "reward_std": 0.011610300280153751, + "rewards//mean": 0.72216796875, + "rewards//std": 0.03053259663283825, + "step": 884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.177, + "grad_norm": 3.4720489978790283, + "kl": 1.6192547511309385, + "learning_rate": 9.315776491082972e-07, + "loss": 0.1619, + "num_tokens": 7649536.0, + "reward": 0.7227783203125, + "reward_std": 0.017359483987092972, + "rewards//mean": 0.7227783203125, + "rewards//std": 0.03739605471491814, + "step": 885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1772, + "grad_norm": 4.207780361175537, + "kl": 2.168350428342819, + "learning_rate": 9.314173287433872e-07, + "loss": 0.2168, + "num_tokens": 7658176.0, + "reward": 0.7642822265625, + "reward_std": 0.01290786825120449, + "rewards//mean": 0.7642822265625, + "rewards//std": 0.02707419916987419, + "step": 886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1774, + "grad_norm": 6.088476657867432, + "kl": 1.813038071617484, + "learning_rate": 9.312568346036287e-07, + "loss": 0.1813, + "num_tokens": 7666712.0, + "reward": 0.76885986328125, + "reward_std": 0.0204804427921772, + "rewards//mean": 0.76885986328125, + "rewards//std": 0.03475475311279297, + "step": 887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1776, + "grad_norm": 5.763800144195557, + "kl": 1.953675914555788, + "learning_rate": 9.310961667536688e-07, + "loss": 0.1954, + "num_tokens": 7675416.0, + "reward": 0.75189208984375, + "reward_std": 0.009357169270515442, + "rewards//mean": 0.75189208984375, + "rewards//std": 0.038190871477127075, + "step": 888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1778, + "grad_norm": 8.297173500061035, + "kl": 1.1260207556188107, + "learning_rate": 9.309353252582245e-07, + "loss": 0.1126, + "num_tokens": 7684048.0, + "reward": 0.7529296875, + "reward_std": 0.014209197834134102, + "rewards//mean": 0.7529296875, + "rewards//std": 0.03625848889350891, + "step": 889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.178, + "grad_norm": 4.038049697875977, + "kl": 1.6453554145991802, + "learning_rate": 9.307743101820827e-07, + "loss": 0.1645, + "num_tokens": 7692680.0, + "reward": 0.76947021484375, + "reward_std": 0.017768949270248413, + "rewards//mean": 0.76947021484375, + "rewards//std": 0.030076975002884865, + "step": 890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1782, + "grad_norm": 3.143575668334961, + "kl": 1.7374642584472895, + "learning_rate": 9.306131215901003e-07, + "loss": 0.1737, + "num_tokens": 7701288.0, + "reward": 0.75341796875, + "reward_std": 0.01818675920367241, + "rewards//mean": 0.75341796875, + "rewards//std": 0.04244700446724892, + "step": 891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1784, + "grad_norm": 2.6063413619995117, + "kl": 1.068063260987401, + "learning_rate": 9.304517595472039e-07, + "loss": 0.1068, + "num_tokens": 7709928.0, + "reward": 0.77374267578125, + "reward_std": 0.007728885859251022, + "rewards//mean": 0.77374267578125, + "rewards//std": 0.02507800981402397, + "step": 892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1786, + "grad_norm": 8.3313570022583, + "kl": 2.452501432970166, + "learning_rate": 9.302902241183903e-07, + "loss": 0.2453, + "num_tokens": 7718504.0, + "reward": 0.7515869140625, + "reward_std": 0.017169872298836708, + "rewards//mean": 0.7515869140625, + "rewards//std": 0.03167125955224037, + "step": 893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1788, + "grad_norm": 11.411910057067871, + "kl": 0.8639341052621603, + "learning_rate": 9.301285153687259e-07, + "loss": 0.0864, + "num_tokens": 7727272.0, + "reward": 0.767333984375, + "reward_std": 0.005527937319129705, + "rewards//mean": 0.767333984375, + "rewards//std": 0.033479172736406326, + "step": 894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.179, + "grad_norm": 21.02414321899414, + "kl": 2.8881300818175077, + "learning_rate": 9.29966633363347e-07, + "loss": 0.2888, + "num_tokens": 7736008.0, + "reward": 0.736328125, + "reward_std": 0.01383928395807743, + "rewards//mean": 0.736328125, + "rewards//std": 0.04077353700995445, + "step": 895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1792, + "grad_norm": 13.486455917358398, + "kl": 2.517023626714945, + "learning_rate": 9.298045781674595e-07, + "loss": 0.2517, + "num_tokens": 7744680.0, + "reward": 0.74737548828125, + "reward_std": 0.012669989839196205, + "rewards//mean": 0.74737548828125, + "rewards//std": 0.03802602365612984, + "step": 896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1794, + "grad_norm": 9.263734817504883, + "kl": 1.875015266239643, + "learning_rate": 9.296423498463395e-07, + "loss": 0.1875, + "num_tokens": 7753328.0, + "reward": 0.787841796875, + "reward_std": 0.016101669520139694, + "rewards//mean": 0.787841796875, + "rewards//std": 0.0325920507311821, + "step": 897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1796, + "grad_norm": 8.410420417785645, + "kl": 2.492162285372615, + "learning_rate": 9.294799484653322e-07, + "loss": 0.2492, + "num_tokens": 7761912.0, + "reward": 0.75738525390625, + "reward_std": 0.013894645497202873, + "rewards//mean": 0.75738525390625, + "rewards//std": 0.026476258412003517, + "step": 898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1798, + "grad_norm": 8.074809074401855, + "kl": 2.27861930988729, + "learning_rate": 9.29317374089853e-07, + "loss": 0.2279, + "num_tokens": 7770584.0, + "reward": 0.75091552734375, + "reward_std": 0.018135320395231247, + "rewards//mean": 0.75091552734375, + "rewards//std": 0.04076012596487999, + "step": 899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.18, + "grad_norm": 9.852544784545898, + "kl": 2.704176900908351, + "learning_rate": 9.291546267853869e-07, + "loss": 0.2704, + "num_tokens": 7779176.0, + "reward": 0.75152587890625, + "reward_std": 0.01635921373963356, + "rewards//mean": 0.75152587890625, + "rewards//std": 0.03717942163348198, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1802, + "grad_norm": 3.585476875305176, + "kl": 1.8306819722056389, + "learning_rate": 9.289917066174885e-07, + "loss": 0.1831, + "num_tokens": 7787928.0, + "reward": 0.746337890625, + "reward_std": 0.013768121600151062, + "rewards//mean": 0.746337890625, + "rewards//std": 0.031588222831487656, + "step": 901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1804, + "grad_norm": 8.426328659057617, + "kl": 2.0367767196148634, + "learning_rate": 9.288286136517819e-07, + "loss": 0.2037, + "num_tokens": 7796560.0, + "reward": 0.7659912109375, + "reward_std": 0.01338636688888073, + "rewards//mean": 0.7659912109375, + "rewards//std": 0.03998004272580147, + "step": 902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1806, + "grad_norm": 18.59994888305664, + "kl": 2.681055746972561, + "learning_rate": 9.28665347953961e-07, + "loss": 0.2681, + "num_tokens": 7805264.0, + "reward": 0.71942138671875, + "reward_std": 0.01512935757637024, + "rewards//mean": 0.71942138671875, + "rewards//std": 0.03471291437745094, + "step": 903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1808, + "grad_norm": 9.62825870513916, + "kl": 3.0684708058834076, + "learning_rate": 9.285019095897893e-07, + "loss": 0.3068, + "num_tokens": 7813920.0, + "reward": 0.76092529296875, + "reward_std": 0.02263367548584938, + "rewards//mean": 0.76092529296875, + "rewards//std": 0.04002521559596062, + "step": 904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.181, + "grad_norm": 15.713327407836914, + "kl": 3.0565064027905464, + "learning_rate": 9.283382986250996e-07, + "loss": 0.3057, + "num_tokens": 7822416.0, + "reward": 0.70458984375, + "reward_std": 0.019932560622692108, + "rewards//mean": 0.70458984375, + "rewards//std": 0.04498450830578804, + "step": 905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1812, + "grad_norm": 4.569295883178711, + "kl": 1.8074570018798113, + "learning_rate": 9.281745151257945e-07, + "loss": 0.1807, + "num_tokens": 7831000.0, + "reward": 0.72930908203125, + "reward_std": 0.012088514864444733, + "rewards//mean": 0.72930908203125, + "rewards//std": 0.043668653815984726, + "step": 906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1814, + "grad_norm": 13.229097366333008, + "kl": 2.921083979308605, + "learning_rate": 9.280105591578458e-07, + "loss": 0.2921, + "num_tokens": 7839616.0, + "reward": 0.7606201171875, + "reward_std": 0.02092359960079193, + "rewards//mean": 0.7606201171875, + "rewards//std": 0.038079578429460526, + "step": 907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1816, + "grad_norm": 12.425844192504883, + "kl": 2.733391372486949, + "learning_rate": 9.278464307872951e-07, + "loss": 0.2733, + "num_tokens": 7848272.0, + "reward": 0.78143310546875, + "reward_std": 0.022195223718881607, + "rewards//mean": 0.78143310546875, + "rewards//std": 0.04661816731095314, + "step": 908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1818, + "grad_norm": 5.109320163726807, + "kl": 1.8693589717149734, + "learning_rate": 9.276821300802533e-07, + "loss": 0.1869, + "num_tokens": 7856824.0, + "reward": 0.755126953125, + "reward_std": 0.018127374351024628, + "rewards//mean": 0.755126953125, + "rewards//std": 0.028132878243923187, + "step": 909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.182, + "grad_norm": 8.810687065124512, + "kl": 2.1022141221910715, + "learning_rate": 9.275176571029006e-07, + "loss": 0.2102, + "num_tokens": 7865384.0, + "reward": 0.7452392578125, + "reward_std": 0.014001820236444473, + "rewards//mean": 0.7452392578125, + "rewards//std": 0.03700868785381317, + "step": 910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1822, + "grad_norm": 4.805698871612549, + "kl": 2.360915334895253, + "learning_rate": 9.273530119214867e-07, + "loss": 0.2361, + "num_tokens": 7874016.0, + "reward": 0.76861572265625, + "reward_std": 0.014025718905031681, + "rewards//mean": 0.76861572265625, + "rewards//std": 0.03152580186724663, + "step": 911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1824, + "grad_norm": 9.826226234436035, + "kl": 1.457917682826519, + "learning_rate": 9.271881946023308e-07, + "loss": 0.1458, + "num_tokens": 7882688.0, + "reward": 0.78216552734375, + "reward_std": 0.019405674189329147, + "rewards//mean": 0.78216552734375, + "rewards//std": 0.03130995109677315, + "step": 912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1826, + "grad_norm": 4.0873188972473145, + "kl": 2.3320354279130697, + "learning_rate": 9.270232052118212e-07, + "loss": 0.2332, + "num_tokens": 7891408.0, + "reward": 0.75823974609375, + "reward_std": 0.02385760098695755, + "rewards//mean": 0.75823974609375, + "rewards//std": 0.04004298523068428, + "step": 913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1828, + "grad_norm": 5.695456027984619, + "kl": 1.780807789415121, + "learning_rate": 9.268580438164155e-07, + "loss": 0.1781, + "num_tokens": 7900024.0, + "reward": 0.75, + "reward_std": 0.01188972033560276, + "rewards//mean": 0.75, + "rewards//std": 0.03650480881333351, + "step": 914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.183, + "grad_norm": 9.227212905883789, + "kl": 1.6088938806205988, + "learning_rate": 9.266927104826408e-07, + "loss": 0.1609, + "num_tokens": 7908696.0, + "reward": 0.75811767578125, + "reward_std": 0.01559586450457573, + "rewards//mean": 0.75811767578125, + "rewards//std": 0.03580343350768089, + "step": 915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1832, + "grad_norm": 7.26616096496582, + "kl": 1.1849891766905785, + "learning_rate": 9.265272052770935e-07, + "loss": 0.1185, + "num_tokens": 7917392.0, + "reward": 0.77154541015625, + "reward_std": 0.012334248051047325, + "rewards//mean": 0.77154541015625, + "rewards//std": 0.02179904840886593, + "step": 916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1834, + "grad_norm": 2.9212043285369873, + "kl": 0.9974309261888266, + "learning_rate": 9.263615282664388e-07, + "loss": 0.0997, + "num_tokens": 7926048.0, + "reward": 0.7374267578125, + "reward_std": 0.008398480713367462, + "rewards//mean": 0.7374267578125, + "rewards//std": 0.04307553917169571, + "step": 917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1836, + "grad_norm": 8.052578926086426, + "kl": 2.2471475172787905, + "learning_rate": 9.261956795174115e-07, + "loss": 0.2247, + "num_tokens": 7935016.0, + "reward": 0.772705078125, + "reward_std": 0.015445258468389511, + "rewards//mean": 0.772705078125, + "rewards//std": 0.04904589429497719, + "step": 918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1838, + "grad_norm": 5.683385848999023, + "kl": 1.0640036799013615, + "learning_rate": 9.260296590968156e-07, + "loss": 0.1064, + "num_tokens": 7943656.0, + "reward": 0.7342529296875, + "reward_std": 0.006155871320515871, + "rewards//mean": 0.7342529296875, + "rewards//std": 0.029748860746622086, + "step": 919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.184, + "grad_norm": 6.42412805557251, + "kl": 1.1672633066773415, + "learning_rate": 9.258634670715237e-07, + "loss": 0.1167, + "num_tokens": 7952288.0, + "reward": 0.73382568359375, + "reward_std": 0.007497087121009827, + "rewards//mean": 0.73382568359375, + "rewards//std": 0.026583530008792877, + "step": 920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1842, + "grad_norm": 5.684611797332764, + "kl": 1.2743122726678848, + "learning_rate": 9.256971035084784e-07, + "loss": 0.1274, + "num_tokens": 7960904.0, + "reward": 0.74481201171875, + "reward_std": 0.01006082259118557, + "rewards//mean": 0.74481201171875, + "rewards//std": 0.03901320695877075, + "step": 921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1844, + "grad_norm": 5.896803855895996, + "kl": 1.1679475717246532, + "learning_rate": 9.255305684746907e-07, + "loss": 0.1168, + "num_tokens": 7969416.0, + "reward": 0.78533935546875, + "reward_std": 0.01715112291276455, + "rewards//mean": 0.78533935546875, + "rewards//std": 0.03011671081185341, + "step": 922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1846, + "grad_norm": 4.434075355529785, + "kl": 1.6047652177512646, + "learning_rate": 9.253638620372408e-07, + "loss": 0.1605, + "num_tokens": 7978048.0, + "reward": 0.773193359375, + "reward_std": 0.017079252749681473, + "rewards//mean": 0.773193359375, + "rewards//std": 0.03856877237558365, + "step": 923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1848, + "grad_norm": 4.12543249130249, + "kl": 0.8505661133676767, + "learning_rate": 9.251969842632783e-07, + "loss": 0.0851, + "num_tokens": 7986648.0, + "reward": 0.76025390625, + "reward_std": 0.009053455665707588, + "rewards//mean": 0.76025390625, + "rewards//std": 0.02545374259352684, + "step": 924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.185, + "grad_norm": 7.275959491729736, + "kl": 1.6916682124137878, + "learning_rate": 9.250299352200212e-07, + "loss": 0.1692, + "num_tokens": 7995320.0, + "reward": 0.76251220703125, + "reward_std": 0.01787765696644783, + "rewards//mean": 0.76251220703125, + "rewards//std": 0.029016973450779915, + "step": 925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1852, + "grad_norm": 26.653705596923828, + "kl": 0.981909729540348, + "learning_rate": 9.248627149747572e-07, + "loss": 0.0982, + "num_tokens": 8003952.0, + "reward": 0.7724609375, + "reward_std": 0.017310626804828644, + "rewards//mean": 0.7724609375, + "rewards//std": 0.03970210626721382, + "step": 926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1854, + "grad_norm": 4.353677749633789, + "kl": 2.0300528090447187, + "learning_rate": 9.246953235948422e-07, + "loss": 0.203, + "num_tokens": 8012560.0, + "reward": 0.7486572265625, + "reward_std": 0.017548371106386185, + "rewards//mean": 0.7486572265625, + "rewards//std": 0.036816760897636414, + "step": 927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1856, + "grad_norm": 3.8817214965820312, + "kl": 1.986955901607871, + "learning_rate": 9.245277611477018e-07, + "loss": 0.1987, + "num_tokens": 8021216.0, + "reward": 0.767333984375, + "reward_std": 0.022250382229685783, + "rewards//mean": 0.767333984375, + "rewards//std": 0.03420204669237137, + "step": 928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1858, + "grad_norm": 10.012185096740723, + "kl": 1.9269897807389498, + "learning_rate": 9.2436002770083e-07, + "loss": 0.1927, + "num_tokens": 8029968.0, + "reward": 0.74853515625, + "reward_std": 0.015564601868391037, + "rewards//mean": 0.74853515625, + "rewards//std": 0.04699111357331276, + "step": 929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.186, + "grad_norm": 14.43603229522705, + "kl": 1.2859059367328882, + "learning_rate": 9.241921233217897e-07, + "loss": 0.1286, + "num_tokens": 8038632.0, + "reward": 0.76507568359375, + "reward_std": 0.009841764345765114, + "rewards//mean": 0.76507568359375, + "rewards//std": 0.03549472615122795, + "step": 930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1862, + "grad_norm": 22.928585052490234, + "kl": 2.3845520988106728, + "learning_rate": 9.240240480782129e-07, + "loss": 0.2385, + "num_tokens": 8047248.0, + "reward": 0.7322998046875, + "reward_std": 0.01873641088604927, + "rewards//mean": 0.7322998046875, + "rewards//std": 0.03428051620721817, + "step": 931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1864, + "grad_norm": 5.155069828033447, + "kl": 2.250894131138921, + "learning_rate": 9.238558020378003e-07, + "loss": 0.2251, + "num_tokens": 8055896.0, + "reward": 0.72259521484375, + "reward_std": 0.01779280975461006, + "rewards//mean": 0.72259521484375, + "rewards//std": 0.04486586153507233, + "step": 932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1866, + "grad_norm": 4.8972320556640625, + "kl": 2.1627304777503014, + "learning_rate": 9.236873852683212e-07, + "loss": 0.2163, + "num_tokens": 8064552.0, + "reward": 0.7603759765625, + "reward_std": 0.022223878651857376, + "rewards//mean": 0.7603759765625, + "rewards//std": 0.03432464599609375, + "step": 933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1868, + "grad_norm": 17.42308807373047, + "kl": 2.201874129474163, + "learning_rate": 9.235187978376141e-07, + "loss": 0.2202, + "num_tokens": 8073232.0, + "reward": 0.75555419921875, + "reward_std": 0.024196408689022064, + "rewards//mean": 0.75555419921875, + "rewards//std": 0.0436471588909626, + "step": 934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.187, + "grad_norm": 4.307592868804932, + "kl": 1.4050643611699343, + "learning_rate": 9.233500398135858e-07, + "loss": 0.1405, + "num_tokens": 8081840.0, + "reward": 0.74835205078125, + "reward_std": 0.016355328261852264, + "rewards//mean": 0.74835205078125, + "rewards//std": 0.03643045201897621, + "step": 935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1872, + "grad_norm": 5.395535469055176, + "kl": 2.3265808075666428, + "learning_rate": 9.23181111264212e-07, + "loss": 0.2327, + "num_tokens": 8090544.0, + "reward": 0.7430419921875, + "reward_std": 0.02195778861641884, + "rewards//mean": 0.7430419921875, + "rewards//std": 0.03717845305800438, + "step": 936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1874, + "grad_norm": 17.81551742553711, + "kl": 1.4119716454297304, + "learning_rate": 9.230120122575375e-07, + "loss": 0.1412, + "num_tokens": 8099208.0, + "reward": 0.74847412109375, + "reward_std": 0.017981721088290215, + "rewards//mean": 0.74847412109375, + "rewards//std": 0.03422145918011665, + "step": 937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1876, + "grad_norm": 9.84168529510498, + "kl": 2.9426979944109917, + "learning_rate": 9.228427428616748e-07, + "loss": 0.2943, + "num_tokens": 8107856.0, + "reward": 0.74212646484375, + "reward_std": 0.015461022034287453, + "rewards//mean": 0.74212646484375, + "rewards//std": 0.030429234728217125, + "step": 938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1878, + "grad_norm": 4.055408000946045, + "kl": 1.9178044013679028, + "learning_rate": 9.22673303144806e-07, + "loss": 0.1918, + "num_tokens": 8116456.0, + "reward": 0.71600341796875, + "reward_std": 0.015704046934843063, + "rewards//mean": 0.71600341796875, + "rewards//std": 0.04995502531528473, + "step": 939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.188, + "grad_norm": 4.922194004058838, + "kl": 1.9541615787893534, + "learning_rate": 9.22503693175181e-07, + "loss": 0.1954, + "num_tokens": 8125016.0, + "reward": 0.742919921875, + "reward_std": 0.015807168558239937, + "rewards//mean": 0.742919921875, + "rewards//std": 0.04359757527709007, + "step": 940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1882, + "grad_norm": 7.407509803771973, + "kl": 1.4618559051305056, + "learning_rate": 9.223339130211192e-07, + "loss": 0.1462, + "num_tokens": 8133600.0, + "reward": 0.7576904296875, + "reward_std": 0.017947595566511154, + "rewards//mean": 0.7576904296875, + "rewards//std": 0.038944318890571594, + "step": 941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1884, + "grad_norm": 16.472244262695312, + "kl": 1.2443057876080275, + "learning_rate": 9.221639627510075e-07, + "loss": 0.1244, + "num_tokens": 8142232.0, + "reward": 0.76678466796875, + "reward_std": 0.018832771107554436, + "rewards//mean": 0.76678466796875, + "rewards//std": 0.027889082208275795, + "step": 942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1886, + "grad_norm": 12.38464641571045, + "kl": 1.4479559306055307, + "learning_rate": 9.219938424333023e-07, + "loss": 0.1448, + "num_tokens": 8150976.0, + "reward": 0.7391357421875, + "reward_std": 0.022428398951888084, + "rewards//mean": 0.7391357421875, + "rewards//std": 0.03541002422571182, + "step": 943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1888, + "grad_norm": 4.915424346923828, + "kl": 1.8865959215909243, + "learning_rate": 9.218235521365276e-07, + "loss": 0.1887, + "num_tokens": 8159640.0, + "reward": 0.76922607421875, + "reward_std": 0.019969366490840912, + "rewards//mean": 0.76922607421875, + "rewards//std": 0.056940995156764984, + "step": 944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.189, + "grad_norm": 8.629729270935059, + "kl": 2.373987479135394, + "learning_rate": 9.216530919292767e-07, + "loss": 0.2374, + "num_tokens": 8168288.0, + "reward": 0.73284912109375, + "reward_std": 0.01817513443529606, + "rewards//mean": 0.73284912109375, + "rewards//std": 0.04339568316936493, + "step": 945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1892, + "grad_norm": 11.080283164978027, + "kl": 3.1587901078164577, + "learning_rate": 9.214824618802107e-07, + "loss": 0.3159, + "num_tokens": 8176928.0, + "reward": 0.75714111328125, + "reward_std": 0.026241883635520935, + "rewards//mean": 0.75714111328125, + "rewards//std": 0.034221019595861435, + "step": 946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1894, + "grad_norm": 9.931846618652344, + "kl": 1.0177927780896425, + "learning_rate": 9.213116620580596e-07, + "loss": 0.1018, + "num_tokens": 8185472.0, + "reward": 0.7650146484375, + "reward_std": 0.011359244585037231, + "rewards//mean": 0.7650146484375, + "rewards//std": 0.022512100636959076, + "step": 947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1896, + "grad_norm": 4.0106520652771, + "kl": 1.7773894555866718, + "learning_rate": 9.211406925316212e-07, + "loss": 0.1777, + "num_tokens": 8194096.0, + "reward": 0.76605224609375, + "reward_std": 0.02363177202641964, + "rewards//mean": 0.76605224609375, + "rewards//std": 0.043174657970666885, + "step": 948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1898, + "grad_norm": 4.636829853057861, + "kl": 1.7715414706617594, + "learning_rate": 9.209695533697623e-07, + "loss": 0.1772, + "num_tokens": 8202632.0, + "reward": 0.74615478515625, + "reward_std": 0.01873505488038063, + "rewards//mean": 0.74615478515625, + "rewards//std": 0.028077874332666397, + "step": 949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.19, + "grad_norm": 6.046106815338135, + "kl": 1.8093213103711605, + "learning_rate": 9.207982446414177e-07, + "loss": 0.1809, + "num_tokens": 8211200.0, + "reward": 0.76531982421875, + "reward_std": 0.01822560280561447, + "rewards//mean": 0.76531982421875, + "rewards//std": 0.02873282879590988, + "step": 950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1902, + "grad_norm": 4.0911545753479, + "kl": 2.0045978389680386, + "learning_rate": 9.206267664155906e-07, + "loss": 0.2005, + "num_tokens": 8219872.0, + "reward": 0.7706298828125, + "reward_std": 0.024520523846149445, + "rewards//mean": 0.7706298828125, + "rewards//std": 0.04281610995531082, + "step": 951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1904, + "grad_norm": 14.912626266479492, + "kl": 3.176901113241911, + "learning_rate": 9.20455118761352e-07, + "loss": 0.3177, + "num_tokens": 8228504.0, + "reward": 0.7489013671875, + "reward_std": 0.017545852810144424, + "rewards//mean": 0.7489013671875, + "rewards//std": 0.028189590200781822, + "step": 952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1906, + "grad_norm": 4.028141498565674, + "kl": 1.8206164687871933, + "learning_rate": 9.202833017478421e-07, + "loss": 0.1821, + "num_tokens": 8237192.0, + "reward": 0.73333740234375, + "reward_std": 0.019356444478034973, + "rewards//mean": 0.73333740234375, + "rewards//std": 0.036869507282972336, + "step": 953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1908, + "grad_norm": 5.144661903381348, + "kl": 1.748854050412774, + "learning_rate": 9.201113154442683e-07, + "loss": 0.1749, + "num_tokens": 8245880.0, + "reward": 0.72723388671875, + "reward_std": 0.011367292143404484, + "rewards//mean": 0.72723388671875, + "rewards//std": 0.039294663816690445, + "step": 954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.191, + "grad_norm": 4.669267177581787, + "kl": 1.543099394068122, + "learning_rate": 9.199391599199071e-07, + "loss": 0.1543, + "num_tokens": 8254472.0, + "reward": 0.7713623046875, + "reward_std": 0.013431857340037823, + "rewards//mean": 0.7713623046875, + "rewards//std": 0.02971627563238144, + "step": 955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1912, + "grad_norm": 9.140859603881836, + "kl": 1.8747816868126392, + "learning_rate": 9.197668352441023e-07, + "loss": 0.1875, + "num_tokens": 8263168.0, + "reward": 0.75628662109375, + "reward_std": 0.01882949098944664, + "rewards//mean": 0.75628662109375, + "rewards//std": 0.03054145723581314, + "step": 956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1914, + "grad_norm": 3.491973400115967, + "kl": 1.6943730656057596, + "learning_rate": 9.195943414862665e-07, + "loss": 0.1694, + "num_tokens": 8271784.0, + "reward": 0.71514892578125, + "reward_std": 0.015275931917130947, + "rewards//mean": 0.71514892578125, + "rewards//std": 0.04649265855550766, + "step": 957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1916, + "grad_norm": 7.8375420570373535, + "kl": 2.460888223722577, + "learning_rate": 9.194216787158804e-07, + "loss": 0.2461, + "num_tokens": 8280496.0, + "reward": 0.73760986328125, + "reward_std": 0.0214063823223114, + "rewards//mean": 0.73760986328125, + "rewards//std": 0.04603851959109306, + "step": 958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1918, + "grad_norm": 6.797418117523193, + "kl": 1.7108256202191114, + "learning_rate": 9.192488470024919e-07, + "loss": 0.1711, + "num_tokens": 8289160.0, + "reward": 0.7525634765625, + "reward_std": 0.013072742149233818, + "rewards//mean": 0.7525634765625, + "rewards//std": 0.03263916075229645, + "step": 959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.192, + "grad_norm": 5.766705513000488, + "kl": 2.004494074732065, + "learning_rate": 9.190758464157182e-07, + "loss": 0.2004, + "num_tokens": 8297768.0, + "reward": 0.72174072265625, + "reward_std": 0.01889863796532154, + "rewards//mean": 0.72174072265625, + "rewards//std": 0.03785166144371033, + "step": 960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1922, + "grad_norm": 6.262502670288086, + "kl": 1.8866459857672453, + "learning_rate": 9.189026770252436e-07, + "loss": 0.1887, + "num_tokens": 8306440.0, + "reward": 0.74481201171875, + "reward_std": 0.023079926148056984, + "rewards//mean": 0.74481201171875, + "rewards//std": 0.04129328951239586, + "step": 961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1924, + "grad_norm": 4.815186977386475, + "kl": 1.9248689897358418, + "learning_rate": 9.187293389008208e-07, + "loss": 0.1925, + "num_tokens": 8315072.0, + "reward": 0.74517822265625, + "reward_std": 0.015207601711153984, + "rewards//mean": 0.74517822265625, + "rewards//std": 0.03300460800528526, + "step": 962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1926, + "grad_norm": 5.999852657318115, + "kl": 1.9934442956000566, + "learning_rate": 9.185558321122704e-07, + "loss": 0.1993, + "num_tokens": 8323776.0, + "reward": 0.77001953125, + "reward_std": 0.024884480983018875, + "rewards//mean": 0.77001953125, + "rewards//std": 0.04075274243950844, + "step": 963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1928, + "grad_norm": 7.562851428985596, + "kl": 1.7993676457554102, + "learning_rate": 9.183821567294808e-07, + "loss": 0.1799, + "num_tokens": 8332408.0, + "reward": 0.744873046875, + "reward_std": 0.02207602560520172, + "rewards//mean": 0.744873046875, + "rewards//std": 0.033355962485075, + "step": 964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.193, + "grad_norm": 7.516448974609375, + "kl": 2.334638250991702, + "learning_rate": 9.182083128224086e-07, + "loss": 0.2335, + "num_tokens": 8341096.0, + "reward": 0.74591064453125, + "reward_std": 0.017029182985424995, + "rewards//mean": 0.74591064453125, + "rewards//std": 0.032056719064712524, + "step": 965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1932, + "grad_norm": 4.7977824211120605, + "kl": 2.1065255533903837, + "learning_rate": 9.180343004610779e-07, + "loss": 0.2107, + "num_tokens": 8349760.0, + "reward": 0.76287841796875, + "reward_std": 0.026503656059503555, + "rewards//mean": 0.76287841796875, + "rewards//std": 0.03985845670104027, + "step": 966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1934, + "grad_norm": 5.4156670570373535, + "kl": 1.567462394014001, + "learning_rate": 9.178601197155811e-07, + "loss": 0.1567, + "num_tokens": 8358440.0, + "reward": 0.745361328125, + "reward_std": 0.013103803619742393, + "rewards//mean": 0.745361328125, + "rewards//std": 0.03417370840907097, + "step": 967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1936, + "grad_norm": 9.369543075561523, + "kl": 1.647336831316352, + "learning_rate": 9.176857706560779e-07, + "loss": 0.1647, + "num_tokens": 8367064.0, + "reward": 0.7431640625, + "reward_std": 0.017528044059872627, + "rewards//mean": 0.7431640625, + "rewards//std": 0.03703179210424423, + "step": 968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1938, + "grad_norm": 3.257838487625122, + "kl": 1.6657201033085585, + "learning_rate": 9.175112533527963e-07, + "loss": 0.1666, + "num_tokens": 8375712.0, + "reward": 0.72479248046875, + "reward_std": 0.011116349138319492, + "rewards//mean": 0.72479248046875, + "rewards//std": 0.028279326856136322, + "step": 969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.194, + "grad_norm": 9.640281677246094, + "kl": 2.476398589089513, + "learning_rate": 9.173365678760317e-07, + "loss": 0.2476, + "num_tokens": 8384464.0, + "reward": 0.75396728515625, + "reward_std": 0.0185113325715065, + "rewards//mean": 0.75396728515625, + "rewards//std": 0.04257799685001373, + "step": 970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1942, + "grad_norm": 8.67280101776123, + "kl": 2.546649331226945, + "learning_rate": 9.171617142961476e-07, + "loss": 0.2547, + "num_tokens": 8393096.0, + "reward": 0.7593994140625, + "reward_std": 0.015201020054519176, + "rewards//mean": 0.7593994140625, + "rewards//std": 0.034672170877456665, + "step": 971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1944, + "grad_norm": 4.121150970458984, + "kl": 2.2070589400827885, + "learning_rate": 9.169866926835747e-07, + "loss": 0.2207, + "num_tokens": 8401712.0, + "reward": 0.739501953125, + "reward_std": 0.020942389965057373, + "rewards//mean": 0.739501953125, + "rewards//std": 0.03589511662721634, + "step": 972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1946, + "grad_norm": 12.351953506469727, + "kl": 3.461782954633236, + "learning_rate": 9.16811503108812e-07, + "loss": 0.3462, + "num_tokens": 8410496.0, + "reward": 0.73712158203125, + "reward_std": 0.019555550068616867, + "rewards//mean": 0.73712158203125, + "rewards//std": 0.037645939737558365, + "step": 973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1948, + "grad_norm": 10.169455528259277, + "kl": 1.4090617876499891, + "learning_rate": 9.166361456424257e-07, + "loss": 0.1409, + "num_tokens": 8419064.0, + "reward": 0.76898193359375, + "reward_std": 0.013797442428767681, + "rewards//mean": 0.76898193359375, + "rewards//std": 0.03554246202111244, + "step": 974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.195, + "grad_norm": 6.749905586242676, + "kl": 2.9098270386457443, + "learning_rate": 9.164606203550497e-07, + "loss": 0.291, + "num_tokens": 8427840.0, + "reward": 0.764892578125, + "reward_std": 0.013962388038635254, + "rewards//mean": 0.764892578125, + "rewards//std": 0.03703097254037857, + "step": 975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1952, + "grad_norm": 9.379358291625977, + "kl": 3.743213150650263, + "learning_rate": 9.162849273173856e-07, + "loss": 0.3743, + "num_tokens": 8436616.0, + "reward": 0.7681884765625, + "reward_std": 0.027786388993263245, + "rewards//mean": 0.7681884765625, + "rewards//std": 0.04356614127755165, + "step": 976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1954, + "grad_norm": 6.3227858543396, + "kl": 2.949258007109165, + "learning_rate": 9.161090666002027e-07, + "loss": 0.2949, + "num_tokens": 8445216.0, + "reward": 0.76934814453125, + "reward_std": 0.023682300001382828, + "rewards//mean": 0.76934814453125, + "rewards//std": 0.050551459193229675, + "step": 977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1956, + "grad_norm": 3.9359500408172607, + "kl": 2.529670547693968, + "learning_rate": 9.159330382743373e-07, + "loss": 0.253, + "num_tokens": 8453896.0, + "reward": 0.7742919921875, + "reward_std": 0.023785840719938278, + "rewards//mean": 0.7742919921875, + "rewards//std": 0.04157046228647232, + "step": 978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1958, + "grad_norm": 7.5341081619262695, + "kl": 2.6453016586601734, + "learning_rate": 9.157568424106941e-07, + "loss": 0.2645, + "num_tokens": 8462552.0, + "reward": 0.77020263671875, + "reward_std": 0.015745025128126144, + "rewards//mean": 0.77020263671875, + "rewards//std": 0.03167765215039253, + "step": 979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.196, + "grad_norm": 3.5417768955230713, + "kl": 2.399245113134384, + "learning_rate": 9.155804790802443e-07, + "loss": 0.2399, + "num_tokens": 8471232.0, + "reward": 0.75244140625, + "reward_std": 0.024859551340341568, + "rewards//mean": 0.75244140625, + "rewards//std": 0.03393634408712387, + "step": 980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1962, + "grad_norm": 3.2599830627441406, + "kl": 2.5723649710416794, + "learning_rate": 9.154039483540272e-07, + "loss": 0.2572, + "num_tokens": 8479856.0, + "reward": 0.7513427734375, + "reward_std": 0.01924026757478714, + "rewards//mean": 0.7513427734375, + "rewards//std": 0.03221340849995613, + "step": 981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1964, + "grad_norm": 11.227360725402832, + "kl": 2.852146787568927, + "learning_rate": 9.152272503031495e-07, + "loss": 0.2852, + "num_tokens": 8488528.0, + "reward": 0.7349853515625, + "reward_std": 0.02431170456111431, + "rewards//mean": 0.7349853515625, + "rewards//std": 0.05037766695022583, + "step": 982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1966, + "grad_norm": 5.53578519821167, + "kl": 2.192291097715497, + "learning_rate": 9.150503849987851e-07, + "loss": 0.2192, + "num_tokens": 8497112.0, + "reward": 0.7540283203125, + "reward_std": 0.01734818145632744, + "rewards//mean": 0.7540283203125, + "rewards//std": 0.04089198634028435, + "step": 983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1968, + "grad_norm": 5.688729763031006, + "kl": 2.156856844201684, + "learning_rate": 9.14873352512175e-07, + "loss": 0.2157, + "num_tokens": 8505752.0, + "reward": 0.76312255859375, + "reward_std": 0.021846525371074677, + "rewards//mean": 0.76312255859375, + "rewards//std": 0.036679331213235855, + "step": 984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.197, + "grad_norm": 3.1646580696105957, + "kl": 1.9843303374946117, + "learning_rate": 9.146961529146284e-07, + "loss": 0.1984, + "num_tokens": 8514376.0, + "reward": 0.74755859375, + "reward_std": 0.015789909288287163, + "rewards//mean": 0.74755859375, + "rewards//std": 0.04050833731889725, + "step": 985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1972, + "grad_norm": 8.397634506225586, + "kl": 1.9697171039879322, + "learning_rate": 9.145187862775208e-07, + "loss": 0.197, + "num_tokens": 8522960.0, + "reward": 0.7210693359375, + "reward_std": 0.011889157816767693, + "rewards//mean": 0.7210693359375, + "rewards//std": 0.040503665804862976, + "step": 986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1974, + "grad_norm": 2.9615468978881836, + "kl": 2.1096298955380917, + "learning_rate": 9.143412526722958e-07, + "loss": 0.211, + "num_tokens": 8531496.0, + "reward": 0.71539306640625, + "reward_std": 0.011229978874325752, + "rewards//mean": 0.71539306640625, + "rewards//std": 0.035827524960041046, + "step": 987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1976, + "grad_norm": 5.069641590118408, + "kl": 3.2557414285838604, + "learning_rate": 9.141635521704636e-07, + "loss": 0.3256, + "num_tokens": 8540232.0, + "reward": 0.74566650390625, + "reward_std": 0.02486158348619938, + "rewards//mean": 0.74566650390625, + "rewards//std": 0.04670478776097298, + "step": 988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1978, + "grad_norm": 3.38765549659729, + "kl": 2.23440158367157, + "learning_rate": 9.139856848436023e-07, + "loss": 0.2234, + "num_tokens": 8548920.0, + "reward": 0.7530517578125, + "reward_std": 0.01635737717151642, + "rewards//mean": 0.7530517578125, + "rewards//std": 0.03826991468667984, + "step": 989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.198, + "grad_norm": 10.754985809326172, + "kl": 2.968583047389984, + "learning_rate": 9.138076507633565e-07, + "loss": 0.2969, + "num_tokens": 8557688.0, + "reward": 0.76715087890625, + "reward_std": 0.02133917436003685, + "rewards//mean": 0.76715087890625, + "rewards//std": 0.041763149201869965, + "step": 990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1982, + "grad_norm": 4.791354656219482, + "kl": 2.0338217727839947, + "learning_rate": 9.136294500014385e-07, + "loss": 0.2034, + "num_tokens": 8566352.0, + "reward": 0.74432373046875, + "reward_std": 0.02426404505968094, + "rewards//mean": 0.74432373046875, + "rewards//std": 0.04016867280006409, + "step": 991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1984, + "grad_norm": 6.58543062210083, + "kl": 2.2268502674996853, + "learning_rate": 9.134510826296276e-07, + "loss": 0.2227, + "num_tokens": 8575064.0, + "reward": 0.73406982421875, + "reward_std": 0.014979375526309013, + "rewards//mean": 0.73406982421875, + "rewards//std": 0.03583724424242973, + "step": 992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1986, + "grad_norm": 7.094622611999512, + "kl": 1.471730774268508, + "learning_rate": 9.1327254871977e-07, + "loss": 0.1472, + "num_tokens": 8583664.0, + "reward": 0.7391357421875, + "reward_std": 0.016242019832134247, + "rewards//mean": 0.7391357421875, + "rewards//std": 0.03274473547935486, + "step": 993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1988, + "grad_norm": 5.462141990661621, + "kl": 1.950741233304143, + "learning_rate": 9.130938483437791e-07, + "loss": 0.1951, + "num_tokens": 8592344.0, + "reward": 0.7523193359375, + "reward_std": 0.017355646938085556, + "rewards//mean": 0.7523193359375, + "rewards//std": 0.03225473314523697, + "step": 994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.199, + "grad_norm": 3.3840034008026123, + "kl": 2.0983388610184193, + "learning_rate": 9.129149815736357e-07, + "loss": 0.2098, + "num_tokens": 8601024.0, + "reward": 0.72772216796875, + "reward_std": 0.018127836287021637, + "rewards//mean": 0.72772216796875, + "rewards//std": 0.04553665593266487, + "step": 995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1992, + "grad_norm": 6.309532642364502, + "kl": 1.3534683883190155, + "learning_rate": 9.12735948481387e-07, + "loss": 0.1353, + "num_tokens": 8609688.0, + "reward": 0.7733154296875, + "reward_std": 0.016825225204229355, + "rewards//mean": 0.7733154296875, + "rewards//std": 0.03628499060869217, + "step": 996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1994, + "grad_norm": 6.512838363647461, + "kl": 1.433094348758459, + "learning_rate": 9.125567491391475e-07, + "loss": 0.1433, + "num_tokens": 8618400.0, + "reward": 0.7647705078125, + "reward_std": 0.016744688153266907, + "rewards//mean": 0.7647705078125, + "rewards//std": 0.03630167245864868, + "step": 997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1996, + "grad_norm": 6.351593494415283, + "kl": 1.7570994179695845, + "learning_rate": 9.123773836190989e-07, + "loss": 0.1757, + "num_tokens": 8627216.0, + "reward": 0.76605224609375, + "reward_std": 0.014316737651824951, + "rewards//mean": 0.76605224609375, + "rewards//std": 0.03454899787902832, + "step": 998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.1998, + "grad_norm": 3.224513053894043, + "kl": 1.7378581073135138, + "learning_rate": 9.121978519934895e-07, + "loss": 0.1738, + "num_tokens": 8635816.0, + "reward": 0.71142578125, + "reward_std": 0.009469851851463318, + "rewards//mean": 0.71142578125, + "rewards//std": 0.03517002612352371, + "step": 999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2, + "grad_norm": 3.5807652473449707, + "kl": 2.5934614650905132, + "learning_rate": 9.120181543346346e-07, + "loss": 0.2593, + "num_tokens": 8644496.0, + "reward": 0.74176025390625, + "reward_std": 0.027724089100956917, + "rewards//mean": 0.74176025390625, + "rewards//std": 0.04741503298282623, + "step": 1000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2002, + "grad_norm": 5.258052825927734, + "kl": 0.9394328705966473, + "learning_rate": 9.118382907149163e-07, + "loss": 0.0939, + "num_tokens": 8653088.0, + "reward": 0.75982666015625, + "reward_std": 0.01254335604608059, + "rewards//mean": 0.75982666015625, + "rewards//std": 0.033013783395290375, + "step": 1001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2004, + "grad_norm": 3.4401493072509766, + "kl": 1.7041137032210827, + "learning_rate": 9.116582612067838e-07, + "loss": 0.1704, + "num_tokens": 8661776.0, + "reward": 0.7347412109375, + "reward_std": 0.01181069016456604, + "rewards//mean": 0.7347412109375, + "rewards//std": 0.03357917442917824, + "step": 1002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2006, + "grad_norm": 4.4879608154296875, + "kl": 1.4810854904353619, + "learning_rate": 9.11478065882753e-07, + "loss": 0.1481, + "num_tokens": 8670552.0, + "reward": 0.7659912109375, + "reward_std": 0.019971122965216637, + "rewards//mean": 0.7659912109375, + "rewards//std": 0.038880523294210434, + "step": 1003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2008, + "grad_norm": 3.7709860801696777, + "kl": 2.517193468287587, + "learning_rate": 9.112977048154064e-07, + "loss": 0.2517, + "num_tokens": 8679200.0, + "reward": 0.7618408203125, + "reward_std": 0.023658381775021553, + "rewards//mean": 0.7618408203125, + "rewards//std": 0.03793619945645332, + "step": 1004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.201, + "grad_norm": 3.0166332721710205, + "kl": 1.6556002162396908, + "learning_rate": 9.111171780773936e-07, + "loss": 0.1656, + "num_tokens": 8687768.0, + "reward": 0.74688720703125, + "reward_std": 0.01199406012892723, + "rewards//mean": 0.74688720703125, + "rewards//std": 0.03949524834752083, + "step": 1005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2012, + "grad_norm": 10.680081367492676, + "kl": 2.370729196816683, + "learning_rate": 9.109364857414305e-07, + "loss": 0.2371, + "num_tokens": 8696448.0, + "reward": 0.7003173828125, + "reward_std": 0.029017120599746704, + "rewards//mean": 0.7003173828125, + "rewards//std": 0.050186194479465485, + "step": 1006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2014, + "grad_norm": 3.0061981678009033, + "kl": 1.3387518040835857, + "learning_rate": 9.107556278803002e-07, + "loss": 0.1339, + "num_tokens": 8705088.0, + "reward": 0.75897216796875, + "reward_std": 0.008711813017725945, + "rewards//mean": 0.75897216796875, + "rewards//std": 0.02459406480193138, + "step": 1007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2016, + "grad_norm": 4.439276695251465, + "kl": 2.059834100306034, + "learning_rate": 9.10574604566852e-07, + "loss": 0.206, + "num_tokens": 8713744.0, + "reward": 0.70880126953125, + "reward_std": 0.013922769576311111, + "rewards//mean": 0.70880126953125, + "rewards//std": 0.03169580549001694, + "step": 1008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2018, + "grad_norm": 3.0243308544158936, + "kl": 1.7723627705127, + "learning_rate": 9.103934158740022e-07, + "loss": 0.1772, + "num_tokens": 8722456.0, + "reward": 0.776611328125, + "reward_std": 0.016336709260940552, + "rewards//mean": 0.776611328125, + "rewards//std": 0.030835507437586784, + "step": 1009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.202, + "grad_norm": 4.09769344329834, + "kl": 2.093655541539192, + "learning_rate": 9.102120618747336e-07, + "loss": 0.2094, + "num_tokens": 8731136.0, + "reward": 0.75537109375, + "reward_std": 0.01660916581749916, + "rewards//mean": 0.75537109375, + "rewards//std": 0.03684491664171219, + "step": 1010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2022, + "grad_norm": 5.781892776489258, + "kl": 1.4492920245975256, + "learning_rate": 9.100305426420956e-07, + "loss": 0.1449, + "num_tokens": 8739872.0, + "reward": 0.76702880859375, + "reward_std": 0.01709713786840439, + "rewards//mean": 0.76702880859375, + "rewards//std": 0.027030762284994125, + "step": 1011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2024, + "grad_norm": 5.448172569274902, + "kl": 1.5353701952844858, + "learning_rate": 9.098488582492039e-07, + "loss": 0.1535, + "num_tokens": 8748488.0, + "reward": 0.76947021484375, + "reward_std": 0.017640870064496994, + "rewards//mean": 0.76947021484375, + "rewards//std": 0.03162360563874245, + "step": 1012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2026, + "grad_norm": 4.6841936111450195, + "kl": 1.4715809114277363, + "learning_rate": 9.096670087692411e-07, + "loss": 0.1472, + "num_tokens": 8757128.0, + "reward": 0.76654052734375, + "reward_std": 0.017565356567502022, + "rewards//mean": 0.76654052734375, + "rewards//std": 0.036096084862947464, + "step": 1013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2028, + "grad_norm": 3.725461959838867, + "kl": 1.6743265595287085, + "learning_rate": 9.094849942754563e-07, + "loss": 0.1674, + "num_tokens": 8765896.0, + "reward": 0.72198486328125, + "reward_std": 0.011678006500005722, + "rewards//mean": 0.72198486328125, + "rewards//std": 0.044230084866285324, + "step": 1014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.203, + "grad_norm": 6.629767894744873, + "kl": 2.057219333946705, + "learning_rate": 9.093028148411648e-07, + "loss": 0.2057, + "num_tokens": 8774560.0, + "reward": 0.77020263671875, + "reward_std": 0.015884580090641975, + "rewards//mean": 0.77020263671875, + "rewards//std": 0.031578097492456436, + "step": 1015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2032, + "grad_norm": 3.7993109226226807, + "kl": 2.157150162383914, + "learning_rate": 9.091204705397483e-07, + "loss": 0.2157, + "num_tokens": 8783240.0, + "reward": 0.75518798828125, + "reward_std": 0.01860562339425087, + "rewards//mean": 0.75518798828125, + "rewards//std": 0.033516623079776764, + "step": 1016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2034, + "grad_norm": 7.185744762420654, + "kl": 1.5592702366411686, + "learning_rate": 9.089379614446553e-07, + "loss": 0.1559, + "num_tokens": 8791904.0, + "reward": 0.76171875, + "reward_std": 0.014105849899351597, + "rewards//mean": 0.76171875, + "rewards//std": 0.03168105334043503, + "step": 1017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2036, + "grad_norm": 3.291555166244507, + "kl": 1.2108670603483915, + "learning_rate": 9.087552876294002e-07, + "loss": 0.1211, + "num_tokens": 8800440.0, + "reward": 0.753173828125, + "reward_std": 0.010028994642198086, + "rewards//mean": 0.753173828125, + "rewards//std": 0.02954385057091713, + "step": 1018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2038, + "grad_norm": 3.1324665546417236, + "kl": 1.5204577669501305, + "learning_rate": 9.085724491675642e-07, + "loss": 0.152, + "num_tokens": 8809040.0, + "reward": 0.7515869140625, + "reward_std": 0.010083088651299477, + "rewards//mean": 0.7515869140625, + "rewards//std": 0.03368719294667244, + "step": 1019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.204, + "grad_norm": 4.665890693664551, + "kl": 1.2971952389925718, + "learning_rate": 9.083894461327945e-07, + "loss": 0.1297, + "num_tokens": 8817712.0, + "reward": 0.76116943359375, + "reward_std": 0.008947715163230896, + "rewards//mean": 0.76116943359375, + "rewards//std": 0.039517853409051895, + "step": 1020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2042, + "grad_norm": 9.229904174804688, + "kl": 1.8431557640433311, + "learning_rate": 9.082062785988048e-07, + "loss": 0.1843, + "num_tokens": 8826296.0, + "reward": 0.7672119140625, + "reward_std": 0.012240275740623474, + "rewards//mean": 0.7672119140625, + "rewards//std": 0.028397180140018463, + "step": 1021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2044, + "grad_norm": 4.694721221923828, + "kl": 1.4546211212873459, + "learning_rate": 9.080229466393749e-07, + "loss": 0.1455, + "num_tokens": 8835024.0, + "reward": 0.77215576171875, + "reward_std": 0.01088004745543003, + "rewards//mean": 0.77215576171875, + "rewards//std": 0.02693820185959339, + "step": 1022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2046, + "grad_norm": 3.1893560886383057, + "kl": 1.9206580389291048, + "learning_rate": 9.078394503283508e-07, + "loss": 0.1921, + "num_tokens": 8843712.0, + "reward": 0.73529052734375, + "reward_std": 0.009692528285086155, + "rewards//mean": 0.73529052734375, + "rewards//std": 0.02680017240345478, + "step": 1023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2048, + "grad_norm": 2.479396343231201, + "kl": 1.483644813299179, + "learning_rate": 9.076557897396451e-07, + "loss": 0.1484, + "num_tokens": 8852368.0, + "reward": 0.7568359375, + "reward_std": 0.01286611519753933, + "rewards//mean": 0.7568359375, + "rewards//std": 0.029911501333117485, + "step": 1024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.205, + "grad_norm": 2.945749044418335, + "kl": 0.9529800787568092, + "learning_rate": 9.074719649472357e-07, + "loss": 0.0953, + "num_tokens": 8860928.0, + "reward": 0.736328125, + "reward_std": 0.005859419237822294, + "rewards//mean": 0.736328125, + "rewards//std": 0.028476230800151825, + "step": 1025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2052, + "grad_norm": 6.230681419372559, + "kl": 1.6063654609024525, + "learning_rate": 9.072879760251679e-07, + "loss": 0.1606, + "num_tokens": 8869624.0, + "reward": 0.74969482421875, + "reward_std": 0.012421256862580776, + "rewards//mean": 0.74969482421875, + "rewards//std": 0.031919483095407486, + "step": 1026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2054, + "grad_norm": 1.848879337310791, + "kl": 1.7765672486275434, + "learning_rate": 9.071038230475519e-07, + "loss": 0.1777, + "num_tokens": 8878224.0, + "reward": 0.7886962890625, + "reward_std": 0.012133300304412842, + "rewards//mean": 0.7886962890625, + "rewards//std": 0.030070748180150986, + "step": 1027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2056, + "grad_norm": 10.26515007019043, + "kl": 1.5696597695350647, + "learning_rate": 9.069195060885646e-07, + "loss": 0.157, + "num_tokens": 8886832.0, + "reward": 0.7757568359375, + "reward_std": 0.012210506945848465, + "rewards//mean": 0.7757568359375, + "rewards//std": 0.027906784787774086, + "step": 1028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2058, + "grad_norm": 7.927394390106201, + "kl": 2.835561953485012, + "learning_rate": 9.067350252224489e-07, + "loss": 0.2836, + "num_tokens": 8895536.0, + "reward": 0.74676513671875, + "reward_std": 0.015318479388952255, + "rewards//mean": 0.74676513671875, + "rewards//std": 0.02599097415804863, + "step": 1029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.206, + "grad_norm": 3.9940266609191895, + "kl": 1.416461167857051, + "learning_rate": 9.065503805235137e-07, + "loss": 0.1416, + "num_tokens": 8904104.0, + "reward": 0.7501220703125, + "reward_std": 0.009496974758803844, + "rewards//mean": 0.7501220703125, + "rewards//std": 0.031350355595350266, + "step": 1030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2062, + "grad_norm": 2.798651695251465, + "kl": 1.3074674978852272, + "learning_rate": 9.06365572066134e-07, + "loss": 0.1307, + "num_tokens": 8912784.0, + "reward": 0.76141357421875, + "reward_std": 0.007983425632119179, + "rewards//mean": 0.76141357421875, + "rewards//std": 0.020591329783201218, + "step": 1031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2064, + "grad_norm": 12.279181480407715, + "kl": 3.5611584540456533, + "learning_rate": 9.061805999247503e-07, + "loss": 0.3561, + "num_tokens": 8921536.0, + "reward": 0.77001953125, + "reward_std": 0.021788431331515312, + "rewards//mean": 0.77001953125, + "rewards//std": 0.04297438636422157, + "step": 1032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2066, + "grad_norm": 8.397802352905273, + "kl": 2.812007764354348, + "learning_rate": 9.059954641738697e-07, + "loss": 0.2812, + "num_tokens": 8930136.0, + "reward": 0.72967529296875, + "reward_std": 0.011716771870851517, + "rewards//mean": 0.72967529296875, + "rewards//std": 0.03453541174530983, + "step": 1033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2068, + "grad_norm": 8.99998950958252, + "kl": 2.567307475954294, + "learning_rate": 9.058101648880645e-07, + "loss": 0.2567, + "num_tokens": 8938864.0, + "reward": 0.777099609375, + "reward_std": 0.010260752402245998, + "rewards//mean": 0.777099609375, + "rewards//std": 0.026161137968301773, + "step": 1034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.207, + "grad_norm": 2.230818033218384, + "kl": 1.468862995505333, + "learning_rate": 9.056247021419734e-07, + "loss": 0.1469, + "num_tokens": 8947568.0, + "reward": 0.7591552734375, + "reward_std": 0.010790163651108742, + "rewards//mean": 0.7591552734375, + "rewards//std": 0.028401443734765053, + "step": 1035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2072, + "grad_norm": 7.6791558265686035, + "kl": 2.5379223749041557, + "learning_rate": 9.054390760103009e-07, + "loss": 0.2538, + "num_tokens": 8956272.0, + "reward": 0.75360107421875, + "reward_std": 0.012005605734884739, + "rewards//mean": 0.75360107421875, + "rewards//std": 0.02732429839670658, + "step": 1036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2074, + "grad_norm": 2.4393842220306396, + "kl": 1.5688249077647924, + "learning_rate": 9.052532865678171e-07, + "loss": 0.1569, + "num_tokens": 8964968.0, + "reward": 0.7652587890625, + "reward_std": 0.00900060124695301, + "rewards//mean": 0.7652587890625, + "rewards//std": 0.03030342608690262, + "step": 1037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2076, + "grad_norm": 2.1573307514190674, + "kl": 1.6273485254496336, + "learning_rate": 9.050673338893577e-07, + "loss": 0.1627, + "num_tokens": 8973680.0, + "reward": 0.73858642578125, + "reward_std": 0.009875812567770481, + "rewards//mean": 0.73858642578125, + "rewards//std": 0.031805459409952164, + "step": 1038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2078, + "grad_norm": 2.5634870529174805, + "kl": 2.4672775603830814, + "learning_rate": 9.04881218049825e-07, + "loss": 0.2467, + "num_tokens": 8982352.0, + "reward": 0.7288818359375, + "reward_std": 0.014227893203496933, + "rewards//mean": 0.7288818359375, + "rewards//std": 0.032494135200977325, + "step": 1039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.208, + "grad_norm": 2.005077838897705, + "kl": 0.9139247164130211, + "learning_rate": 9.046949391241858e-07, + "loss": 0.0914, + "num_tokens": 8991008.0, + "reward": 0.72296142578125, + "reward_std": 0.004710361361503601, + "rewards//mean": 0.72296142578125, + "rewards//std": 0.031182534992694855, + "step": 1040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2082, + "grad_norm": 8.661205291748047, + "kl": 2.7401093523949385, + "learning_rate": 9.045084971874737e-07, + "loss": 0.274, + "num_tokens": 8999736.0, + "reward": 0.7806396484375, + "reward_std": 0.01274215430021286, + "rewards//mean": 0.7806396484375, + "rewards//std": 0.030058663338422775, + "step": 1041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2084, + "grad_norm": 13.921006202697754, + "kl": 2.183920970186591, + "learning_rate": 9.043218923147873e-07, + "loss": 0.2184, + "num_tokens": 9008384.0, + "reward": 0.73284912109375, + "reward_std": 0.008773128502070904, + "rewards//mean": 0.73284912109375, + "rewards//std": 0.04187897592782974, + "step": 1042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2086, + "grad_norm": 5.314465045928955, + "kl": 1.8124338928610086, + "learning_rate": 9.04135124581291e-07, + "loss": 0.1812, + "num_tokens": 9017096.0, + "reward": 0.75048828125, + "reward_std": 0.01586126536130905, + "rewards//mean": 0.75048828125, + "rewards//std": 0.027511531487107277, + "step": 1043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2088, + "grad_norm": 5.20868444442749, + "kl": 3.3300900626927614, + "learning_rate": 9.039481940622146e-07, + "loss": 0.333, + "num_tokens": 9025688.0, + "reward": 0.75067138671875, + "reward_std": 0.017955124378204346, + "rewards//mean": 0.75067138671875, + "rewards//std": 0.02610255777835846, + "step": 1044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.209, + "grad_norm": 5.6909356117248535, + "kl": 2.0218575745821, + "learning_rate": 9.037611008328543e-07, + "loss": 0.2022, + "num_tokens": 9034344.0, + "reward": 0.7503662109375, + "reward_std": 0.013804212212562561, + "rewards//mean": 0.7503662109375, + "rewards//std": 0.0436466783285141, + "step": 1045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2092, + "grad_norm": 4.027197360992432, + "kl": 2.3590909838676453, + "learning_rate": 9.035738449685706e-07, + "loss": 0.2359, + "num_tokens": 9043032.0, + "reward": 0.74188232421875, + "reward_std": 0.01404221449047327, + "rewards//mean": 0.74188232421875, + "rewards//std": 0.04413003474473953, + "step": 1046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2094, + "grad_norm": 2.821146011352539, + "kl": 1.6312313880771399, + "learning_rate": 9.033864265447906e-07, + "loss": 0.1631, + "num_tokens": 9051672.0, + "reward": 0.7440185546875, + "reward_std": 0.01578584499657154, + "rewards//mean": 0.7440185546875, + "rewards//std": 0.03487589955329895, + "step": 1047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2096, + "grad_norm": 2.958963632583618, + "kl": 1.6689205151051283, + "learning_rate": 9.031988456370061e-07, + "loss": 0.1669, + "num_tokens": 9060280.0, + "reward": 0.71722412109375, + "reward_std": 0.013462478294968605, + "rewards//mean": 0.71722412109375, + "rewards//std": 0.02801850624382496, + "step": 1048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2098, + "grad_norm": 10.404927253723145, + "kl": 2.314605975523591, + "learning_rate": 9.030111023207749e-07, + "loss": 0.2315, + "num_tokens": 9068992.0, + "reward": 0.73309326171875, + "reward_std": 0.010471160523593426, + "rewards//mean": 0.73309326171875, + "rewards//std": 0.04128742218017578, + "step": 1049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.21, + "grad_norm": 4.192626476287842, + "kl": 2.7768554501235485, + "learning_rate": 9.028231966717198e-07, + "loss": 0.2777, + "num_tokens": 9077648.0, + "reward": 0.75421142578125, + "reward_std": 0.016651127487421036, + "rewards//mean": 0.75421142578125, + "rewards//std": 0.03971616178750992, + "step": 1050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2102, + "grad_norm": 6.3300604820251465, + "kl": 2.7328280713409185, + "learning_rate": 9.026351287655293e-07, + "loss": 0.2733, + "num_tokens": 9086304.0, + "reward": 0.7623291015625, + "reward_std": 0.018146753311157227, + "rewards//mean": 0.7623291015625, + "rewards//std": 0.036233220249414444, + "step": 1051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2104, + "grad_norm": 5.513199806213379, + "kl": 1.0714191440492868, + "learning_rate": 9.02446898677957e-07, + "loss": 0.1071, + "num_tokens": 9094952.0, + "reward": 0.78021240234375, + "reward_std": 0.006924469955265522, + "rewards//mean": 0.78021240234375, + "rewards//std": 0.021868381649255753, + "step": 1052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2106, + "grad_norm": 4.735969066619873, + "kl": 1.3821017984300852, + "learning_rate": 9.02258506484822e-07, + "loss": 0.1382, + "num_tokens": 9103560.0, + "reward": 0.763427734375, + "reward_std": 0.012097623199224472, + "rewards//mean": 0.763427734375, + "rewards//std": 0.031218014657497406, + "step": 1053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2108, + "grad_norm": 5.088438034057617, + "kl": 1.687829440459609, + "learning_rate": 9.02069952262009e-07, + "loss": 0.1688, + "num_tokens": 9112192.0, + "reward": 0.7362060546875, + "reward_std": 0.008893121033906937, + "rewards//mean": 0.7362060546875, + "rewards//std": 0.04226670414209366, + "step": 1054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.211, + "grad_norm": 9.628849983215332, + "kl": 3.9030132219195366, + "learning_rate": 9.018812360854671e-07, + "loss": 0.3903, + "num_tokens": 9120840.0, + "reward": 0.74041748046875, + "reward_std": 0.02466106228530407, + "rewards//mean": 0.74041748046875, + "rewards//std": 0.03548832982778549, + "step": 1055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2112, + "grad_norm": 9.033496856689453, + "kl": 1.4859026093035936, + "learning_rate": 9.016923580312113e-07, + "loss": 0.1486, + "num_tokens": 9129488.0, + "reward": 0.76214599609375, + "reward_std": 0.012279321439564228, + "rewards//mean": 0.76214599609375, + "rewards//std": 0.029521232470870018, + "step": 1056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2114, + "grad_norm": 3.2272932529449463, + "kl": 2.0986061356961727, + "learning_rate": 9.015033181753218e-07, + "loss": 0.2099, + "num_tokens": 9138040.0, + "reward": 0.745361328125, + "reward_std": 0.020388251170516014, + "rewards//mean": 0.745361328125, + "rewards//std": 0.041694995015859604, + "step": 1057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2116, + "grad_norm": 2.655343532562256, + "kl": 2.4686303231865168, + "learning_rate": 9.013141165939438e-07, + "loss": 0.2469, + "num_tokens": 9146648.0, + "reward": 0.771484375, + "reward_std": 0.02447417750954628, + "rewards//mean": 0.771484375, + "rewards//std": 0.03653796762228012, + "step": 1058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2118, + "grad_norm": 4.1452317237854, + "kl": 1.6574193220585585, + "learning_rate": 9.011247533632875e-07, + "loss": 0.1657, + "num_tokens": 9155216.0, + "reward": 0.74169921875, + "reward_std": 0.009907124564051628, + "rewards//mean": 0.74169921875, + "rewards//std": 0.030205607414245605, + "step": 1059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.212, + "grad_norm": 2.804823875427246, + "kl": 1.7262969482690096, + "learning_rate": 9.009352285596285e-07, + "loss": 0.1726, + "num_tokens": 9163848.0, + "reward": 0.7520751953125, + "reward_std": 0.01055437233299017, + "rewards//mean": 0.7520751953125, + "rewards//std": 0.022412359714508057, + "step": 1060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2122, + "grad_norm": 11.700467109680176, + "kl": 2.8435444589704275, + "learning_rate": 9.007455422593075e-07, + "loss": 0.2844, + "num_tokens": 9172520.0, + "reward": 0.76397705078125, + "reward_std": 0.015800870954990387, + "rewards//mean": 0.76397705078125, + "rewards//std": 0.042162489145994186, + "step": 1061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2124, + "grad_norm": 6.289818286895752, + "kl": 1.9879839420318604, + "learning_rate": 9.0055569453873e-07, + "loss": 0.1988, + "num_tokens": 9181096.0, + "reward": 0.75042724609375, + "reward_std": 0.01895938068628311, + "rewards//mean": 0.75042724609375, + "rewards//std": 0.03131284937262535, + "step": 1062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2126, + "grad_norm": 2.4594168663024902, + "kl": 2.264555150642991, + "learning_rate": 9.003656854743666e-07, + "loss": 0.2265, + "num_tokens": 9189752.0, + "reward": 0.71282958984375, + "reward_std": 0.01371677964925766, + "rewards//mean": 0.71282958984375, + "rewards//std": 0.04367593303322792, + "step": 1063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2128, + "grad_norm": 5.355173110961914, + "kl": 1.163853295147419, + "learning_rate": 9.00175515142753e-07, + "loss": 0.1164, + "num_tokens": 9198392.0, + "reward": 0.7357177734375, + "reward_std": 0.013257784768939018, + "rewards//mean": 0.7357177734375, + "rewards//std": 0.036836493760347366, + "step": 1064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.213, + "grad_norm": 7.627799034118652, + "kl": 2.075035708025098, + "learning_rate": 8.9998518362049e-07, + "loss": 0.2075, + "num_tokens": 9207120.0, + "reward": 0.74114990234375, + "reward_std": 0.016323495656251907, + "rewards//mean": 0.74114990234375, + "rewards//std": 0.03852081298828125, + "step": 1065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2132, + "grad_norm": 2.5370049476623535, + "kl": 1.1113354787230492, + "learning_rate": 8.997946909842424e-07, + "loss": 0.1111, + "num_tokens": 9215808.0, + "reward": 0.77667236328125, + "reward_std": 0.007781160529702902, + "rewards//mean": 0.77667236328125, + "rewards//std": 0.024024909362196922, + "step": 1066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2134, + "grad_norm": 3.582096815109253, + "kl": 2.111095203086734, + "learning_rate": 8.996040373107414e-07, + "loss": 0.2111, + "num_tokens": 9224600.0, + "reward": 0.7498779296875, + "reward_std": 0.013851397670805454, + "rewards//mean": 0.7498779296875, + "rewards//std": 0.03584172949194908, + "step": 1067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2136, + "grad_norm": 6.173175811767578, + "kl": 2.086049735546112, + "learning_rate": 8.994132226767819e-07, + "loss": 0.2086, + "num_tokens": 9233272.0, + "reward": 0.76031494140625, + "reward_std": 0.018188945949077606, + "rewards//mean": 0.76031494140625, + "rewards//std": 0.04558815062046051, + "step": 1068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2138, + "grad_norm": 11.23176383972168, + "kl": 2.007845725864172, + "learning_rate": 8.992222471592239e-07, + "loss": 0.2008, + "num_tokens": 9241864.0, + "reward": 0.7625732421875, + "reward_std": 0.01386364083737135, + "rewards//mean": 0.7625732421875, + "rewards//std": 0.03949088603258133, + "step": 1069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.214, + "grad_norm": 4.25261926651001, + "kl": 1.6897999960929155, + "learning_rate": 8.990311108349926e-07, + "loss": 0.169, + "num_tokens": 9250568.0, + "reward": 0.75799560546875, + "reward_std": 0.015520873479545116, + "rewards//mean": 0.75799560546875, + "rewards//std": 0.04118390008807182, + "step": 1070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2142, + "grad_norm": 4.411772727966309, + "kl": 2.255348764359951, + "learning_rate": 8.988398137810776e-07, + "loss": 0.2255, + "num_tokens": 9259280.0, + "reward": 0.74029541015625, + "reward_std": 0.011341812089085579, + "rewards//mean": 0.74029541015625, + "rewards//std": 0.019697854295372963, + "step": 1071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2144, + "grad_norm": 6.922074317932129, + "kl": 1.9375630132853985, + "learning_rate": 8.986483560745333e-07, + "loss": 0.1938, + "num_tokens": 9267928.0, + "reward": 0.74188232421875, + "reward_std": 0.010026191361248493, + "rewards//mean": 0.74188232421875, + "rewards//std": 0.02257748320698738, + "step": 1072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2146, + "grad_norm": 5.813230991363525, + "kl": 2.6822686679661274, + "learning_rate": 8.984567377924789e-07, + "loss": 0.2682, + "num_tokens": 9276600.0, + "reward": 0.7318115234375, + "reward_std": 0.020570648834109306, + "rewards//mean": 0.7318115234375, + "rewards//std": 0.03699395805597305, + "step": 1073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2148, + "grad_norm": 4.24099588394165, + "kl": 1.593309286981821, + "learning_rate": 8.982649590120981e-07, + "loss": 0.1593, + "num_tokens": 9285168.0, + "reward": 0.767578125, + "reward_std": 0.016599806025624275, + "rewards//mean": 0.767578125, + "rewards//std": 0.04181753098964691, + "step": 1074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.215, + "grad_norm": 30.643138885498047, + "kl": 1.741103883832693, + "learning_rate": 8.980730198106394e-07, + "loss": 0.1741, + "num_tokens": 9293880.0, + "reward": 0.71392822265625, + "reward_std": 0.007119806483387947, + "rewards//mean": 0.71392822265625, + "rewards//std": 0.03858049958944321, + "step": 1075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2152, + "grad_norm": 33.59050750732422, + "kl": 2.6876457687467337, + "learning_rate": 8.97880920265416e-07, + "loss": 0.2688, + "num_tokens": 9302696.0, + "reward": 0.75274658203125, + "reward_std": 0.012646064162254333, + "rewards//mean": 0.75274658203125, + "rewards//std": 0.035677216947078705, + "step": 1076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2154, + "grad_norm": 24.11741065979004, + "kl": 3.5284326169639826, + "learning_rate": 8.976886604538055e-07, + "loss": 0.3528, + "num_tokens": 9311360.0, + "reward": 0.75982666015625, + "reward_std": 0.022194216027855873, + "rewards//mean": 0.75982666015625, + "rewards//std": 0.04469548910856247, + "step": 1077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2156, + "grad_norm": 11.08321475982666, + "kl": 2.338799251243472, + "learning_rate": 8.974962404532501e-07, + "loss": 0.2339, + "num_tokens": 9320040.0, + "reward": 0.76312255859375, + "reward_std": 0.012554142624139786, + "rewards//mean": 0.76312255859375, + "rewards//std": 0.03452971577644348, + "step": 1078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2158, + "grad_norm": 27.886621475219727, + "kl": 1.4472415745258331, + "learning_rate": 8.973036603412566e-07, + "loss": 0.1447, + "num_tokens": 9328624.0, + "reward": 0.7734375, + "reward_std": 0.01772221550345421, + "rewards//mean": 0.7734375, + "rewards//std": 0.03383985534310341, + "step": 1079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.216, + "grad_norm": 8.311849594116211, + "kl": 1.8753036558628082, + "learning_rate": 8.971109201953962e-07, + "loss": 0.1875, + "num_tokens": 9337216.0, + "reward": 0.759521484375, + "reward_std": 0.022103890776634216, + "rewards//mean": 0.759521484375, + "rewards//std": 0.033681128174066544, + "step": 1080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2162, + "grad_norm": 4.862329006195068, + "kl": 1.6421557180583477, + "learning_rate": 8.969180200933047e-07, + "loss": 0.1642, + "num_tokens": 9345800.0, + "reward": 0.75360107421875, + "reward_std": 0.0184025838971138, + "rewards//mean": 0.75360107421875, + "rewards//std": 0.03356897458434105, + "step": 1081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2164, + "grad_norm": 3.538501739501953, + "kl": 1.4810181353241205, + "learning_rate": 8.967249601126821e-07, + "loss": 0.1481, + "num_tokens": 9354368.0, + "reward": 0.75732421875, + "reward_std": 0.016151435673236847, + "rewards//mean": 0.75732421875, + "rewards//std": 0.04505982622504234, + "step": 1082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2166, + "grad_norm": 26.427112579345703, + "kl": 1.828379400074482, + "learning_rate": 8.96531740331293e-07, + "loss": 0.1828, + "num_tokens": 9363000.0, + "reward": 0.76458740234375, + "reward_std": 0.01369639951735735, + "rewards//mean": 0.76458740234375, + "rewards//std": 0.03298625722527504, + "step": 1083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2168, + "grad_norm": 12.101463317871094, + "kl": 1.2811105847358704, + "learning_rate": 8.963383608269663e-07, + "loss": 0.1281, + "num_tokens": 9371672.0, + "reward": 0.75506591796875, + "reward_std": 0.009559770114719868, + "rewards//mean": 0.75506591796875, + "rewards//std": 0.02435656450688839, + "step": 1084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.217, + "grad_norm": 12.138514518737793, + "kl": 2.161335153505206, + "learning_rate": 8.961448216775953e-07, + "loss": 0.2161, + "num_tokens": 9380344.0, + "reward": 0.73681640625, + "reward_std": 0.009760679677128792, + "rewards//mean": 0.73681640625, + "rewards//std": 0.026083486154675484, + "step": 1085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2172, + "grad_norm": 24.755420684814453, + "kl": 2.8088847771286964, + "learning_rate": 8.959511229611375e-07, + "loss": 0.2809, + "num_tokens": 9389040.0, + "reward": 0.75531005859375, + "reward_std": 0.014149040915071964, + "rewards//mean": 0.75531005859375, + "rewards//std": 0.02670624479651451, + "step": 1086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2174, + "grad_norm": 10.738611221313477, + "kl": 2.352067621424794, + "learning_rate": 8.957572647556147e-07, + "loss": 0.2352, + "num_tokens": 9397592.0, + "reward": 0.7427978515625, + "reward_std": 0.01083272136747837, + "rewards//mean": 0.7427978515625, + "rewards//std": 0.03283890709280968, + "step": 1087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2176, + "grad_norm": 36.97765350341797, + "kl": 4.06044471822679, + "learning_rate": 8.95563247139113e-07, + "loss": 0.406, + "num_tokens": 9406272.0, + "reward": 0.7430419921875, + "reward_std": 0.015127220191061497, + "rewards//mean": 0.7430419921875, + "rewards//std": 0.055607471615076065, + "step": 1088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2178, + "grad_norm": 13.865893363952637, + "kl": 2.522465394809842, + "learning_rate": 8.953690701897827e-07, + "loss": 0.2522, + "num_tokens": 9414848.0, + "reward": 0.7244873046875, + "reward_std": 0.017161235213279724, + "rewards//mean": 0.7244873046875, + "rewards//std": 0.03700541704893112, + "step": 1089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.218, + "grad_norm": 6.7566680908203125, + "kl": 1.7745242714881897, + "learning_rate": 8.951747339858382e-07, + "loss": 0.1775, + "num_tokens": 9423448.0, + "reward": 0.7322998046875, + "reward_std": 0.008848993107676506, + "rewards//mean": 0.7322998046875, + "rewards//std": 0.03039320930838585, + "step": 1090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2182, + "grad_norm": 32.137081146240234, + "kl": 2.067481989040971, + "learning_rate": 8.94980238605558e-07, + "loss": 0.2067, + "num_tokens": 9432040.0, + "reward": 0.748046875, + "reward_std": 0.01435694471001625, + "rewards//mean": 0.748046875, + "rewards//std": 0.03106343187391758, + "step": 1091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2184, + "grad_norm": 27.685312271118164, + "kl": 1.8618167992681265, + "learning_rate": 8.947855841272851e-07, + "loss": 0.1862, + "num_tokens": 9440632.0, + "reward": 0.75518798828125, + "reward_std": 0.010974636301398277, + "rewards//mean": 0.75518798828125, + "rewards//std": 0.02978210709989071, + "step": 1092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2186, + "grad_norm": 7.296536445617676, + "kl": 1.5944556891918182, + "learning_rate": 8.94590770629426e-07, + "loss": 0.1594, + "num_tokens": 9449312.0, + "reward": 0.6986083984375, + "reward_std": 0.010842295363545418, + "rewards//mean": 0.6986083984375, + "rewards//std": 0.03731987997889519, + "step": 1093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2188, + "grad_norm": 4.455016613006592, + "kl": 1.0433367993682623, + "learning_rate": 8.943957981904517e-07, + "loss": 0.1043, + "num_tokens": 9458032.0, + "reward": 0.76824951171875, + "reward_std": 0.00847709272056818, + "rewards//mean": 0.76824951171875, + "rewards//std": 0.031102817505598068, + "step": 1094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.219, + "grad_norm": 3.9581363201141357, + "kl": 0.7808334045112133, + "learning_rate": 8.942006668888971e-07, + "loss": 0.0781, + "num_tokens": 9466552.0, + "reward": 0.756103515625, + "reward_std": 0.006937914527952671, + "rewards//mean": 0.756103515625, + "rewards//std": 0.024068424478173256, + "step": 1095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2192, + "grad_norm": 3.87412166595459, + "kl": 1.3768600430339575, + "learning_rate": 8.940053768033608e-07, + "loss": 0.1377, + "num_tokens": 9475248.0, + "reward": 0.75006103515625, + "reward_std": 0.008184343576431274, + "rewards//mean": 0.75006103515625, + "rewards//std": 0.036851439625024796, + "step": 1096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2194, + "grad_norm": 5.8469061851501465, + "kl": 0.9254527110606432, + "learning_rate": 8.938099280125062e-07, + "loss": 0.0925, + "num_tokens": 9483984.0, + "reward": 0.77789306640625, + "reward_std": 0.01542261429131031, + "rewards//mean": 0.77789306640625, + "rewards//std": 0.036958906799554825, + "step": 1097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2196, + "grad_norm": 8.163247108459473, + "kl": 1.3382260501384735, + "learning_rate": 8.936143205950595e-07, + "loss": 0.1338, + "num_tokens": 9492560.0, + "reward": 0.74798583984375, + "reward_std": 0.01950734481215477, + "rewards//mean": 0.74798583984375, + "rewards//std": 0.03747384622693062, + "step": 1098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2198, + "grad_norm": 4.726160526275635, + "kl": 0.6422359738498926, + "learning_rate": 8.934185546298115e-07, + "loss": 0.0642, + "num_tokens": 9501136.0, + "reward": 0.70245361328125, + "reward_std": 0.0035011344589293003, + "rewards//mean": 0.70245361328125, + "rewards//std": 0.038333695381879807, + "step": 1099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.22, + "grad_norm": 3.5903127193450928, + "kl": 1.405467739328742, + "learning_rate": 8.932226301956169e-07, + "loss": 0.1405, + "num_tokens": 9509816.0, + "reward": 0.7440185546875, + "reward_std": 0.014332575723528862, + "rewards//mean": 0.7440185546875, + "rewards//std": 0.034470751881599426, + "step": 1100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2202, + "grad_norm": 4.708259582519531, + "kl": 1.6381273418664932, + "learning_rate": 8.930265473713937e-07, + "loss": 0.1638, + "num_tokens": 9518472.0, + "reward": 0.75689697265625, + "reward_std": 0.021110452711582184, + "rewards//mean": 0.75689697265625, + "rewards//std": 0.04539315402507782, + "step": 1101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2204, + "grad_norm": 3.713083267211914, + "kl": 0.80934071354568, + "learning_rate": 8.928303062361243e-07, + "loss": 0.0809, + "num_tokens": 9527136.0, + "reward": 0.73992919921875, + "reward_std": 0.006100708618760109, + "rewards//mean": 0.73992919921875, + "rewards//std": 0.03321719914674759, + "step": 1102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2206, + "grad_norm": 5.256967544555664, + "kl": 1.395573116838932, + "learning_rate": 8.926339068688545e-07, + "loss": 0.1396, + "num_tokens": 9535776.0, + "reward": 0.74481201171875, + "reward_std": 0.014384103938937187, + "rewards//mean": 0.74481201171875, + "rewards//std": 0.041369467973709106, + "step": 1103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2208, + "grad_norm": 9.163372993469238, + "kl": 1.5889676082879305, + "learning_rate": 8.924373493486941e-07, + "loss": 0.1589, + "num_tokens": 9544424.0, + "reward": 0.74603271484375, + "reward_std": 0.018748531118035316, + "rewards//mean": 0.74603271484375, + "rewards//std": 0.04023231193423271, + "step": 1104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.221, + "grad_norm": 4.666876316070557, + "kl": 1.8292632326483727, + "learning_rate": 8.922406337548161e-07, + "loss": 0.1829, + "num_tokens": 9553032.0, + "reward": 0.71136474609375, + "reward_std": 0.013210605829954147, + "rewards//mean": 0.71136474609375, + "rewards//std": 0.029798876494169235, + "step": 1105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2212, + "grad_norm": 3.660369634628296, + "kl": 2.339859602972865, + "learning_rate": 8.920437601664579e-07, + "loss": 0.234, + "num_tokens": 9561712.0, + "reward": 0.77392578125, + "reward_std": 0.023278575390577316, + "rewards//mean": 0.77392578125, + "rewards//std": 0.041721854358911514, + "step": 1106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2214, + "grad_norm": 3.374110698699951, + "kl": 1.1332755852490664, + "learning_rate": 8.918467286629198e-07, + "loss": 0.1133, + "num_tokens": 9570320.0, + "reward": 0.73443603515625, + "reward_std": 0.00900747999548912, + "rewards//mean": 0.73443603515625, + "rewards//std": 0.03303944692015648, + "step": 1107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2216, + "grad_norm": 8.907243728637695, + "kl": 1.6807291693985462, + "learning_rate": 8.916495393235665e-07, + "loss": 0.1681, + "num_tokens": 9578848.0, + "reward": 0.74151611328125, + "reward_std": 0.008039504289627075, + "rewards//mean": 0.74151611328125, + "rewards//std": 0.03710075840353966, + "step": 1108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2218, + "grad_norm": 5.270492076873779, + "kl": 1.2160240355879068, + "learning_rate": 8.914521922278255e-07, + "loss": 0.1216, + "num_tokens": 9587480.0, + "reward": 0.73681640625, + "reward_std": 0.008157813921570778, + "rewards//mean": 0.73681640625, + "rewards//std": 0.03306881710886955, + "step": 1109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.222, + "grad_norm": 3.2378830909729004, + "kl": 1.6577156893908978, + "learning_rate": 8.912546874551882e-07, + "loss": 0.1658, + "num_tokens": 9596152.0, + "reward": 0.75250244140625, + "reward_std": 0.011570584028959274, + "rewards//mean": 0.75250244140625, + "rewards//std": 0.03636099398136139, + "step": 1110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2222, + "grad_norm": 3.7634778022766113, + "kl": 1.8262662645429373, + "learning_rate": 8.910570250852096e-07, + "loss": 0.1826, + "num_tokens": 9604800.0, + "reward": 0.74334716796875, + "reward_std": 0.015612797811627388, + "rewards//mean": 0.74334716796875, + "rewards//std": 0.03957182541489601, + "step": 1111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2224, + "grad_norm": 5.38132381439209, + "kl": 1.1248016580939293, + "learning_rate": 8.908592051975081e-07, + "loss": 0.1125, + "num_tokens": 9613480.0, + "reward": 0.76190185546875, + "reward_std": 0.0102156363427639, + "rewards//mean": 0.76190185546875, + "rewards//std": 0.027528515085577965, + "step": 1112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2226, + "grad_norm": 3.525296926498413, + "kl": 1.8597209546715021, + "learning_rate": 8.906612278717655e-07, + "loss": 0.186, + "num_tokens": 9622088.0, + "reward": 0.7320556640625, + "reward_std": 0.016202254220843315, + "rewards//mean": 0.7320556640625, + "rewards//std": 0.043121904134750366, + "step": 1113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2228, + "grad_norm": 6.389703273773193, + "kl": 0.8898295853286982, + "learning_rate": 8.90463093187727e-07, + "loss": 0.089, + "num_tokens": 9630704.0, + "reward": 0.762939453125, + "reward_std": 0.008746866136789322, + "rewards//mean": 0.762939453125, + "rewards//std": 0.017946293577551842, + "step": 1114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.223, + "grad_norm": 4.210634231567383, + "kl": 0.8167678322643042, + "learning_rate": 8.902648012252012e-07, + "loss": 0.0817, + "num_tokens": 9639360.0, + "reward": 0.7841796875, + "reward_std": 0.007580064702779055, + "rewards//mean": 0.7841796875, + "rewards//std": 0.023602265864610672, + "step": 1115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2232, + "grad_norm": 7.35414457321167, + "kl": 1.5044339783489704, + "learning_rate": 8.900663520640603e-07, + "loss": 0.1504, + "num_tokens": 9647976.0, + "reward": 0.732666015625, + "reward_std": 0.012166472151875496, + "rewards//mean": 0.732666015625, + "rewards//std": 0.035268884152173996, + "step": 1116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2234, + "grad_norm": 4.93861722946167, + "kl": 1.047438146546483, + "learning_rate": 8.898677457842394e-07, + "loss": 0.1047, + "num_tokens": 9656608.0, + "reward": 0.78271484375, + "reward_std": 0.009562061168253422, + "rewards//mean": 0.78271484375, + "rewards//std": 0.028843844309449196, + "step": 1117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2236, + "grad_norm": 12.119524002075195, + "kl": 1.4735262338072062, + "learning_rate": 8.896689824657371e-07, + "loss": 0.1474, + "num_tokens": 9665184.0, + "reward": 0.7591552734375, + "reward_std": 0.008323341608047485, + "rewards//mean": 0.7591552734375, + "rewards//std": 0.024902725592255592, + "step": 1118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2238, + "grad_norm": 16.558944702148438, + "kl": 1.3315905779600143, + "learning_rate": 8.894700621886152e-07, + "loss": 0.1332, + "num_tokens": 9673856.0, + "reward": 0.75994873046875, + "reward_std": 0.012154627591371536, + "rewards//mean": 0.75994873046875, + "rewards//std": 0.03866319730877876, + "step": 1119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.224, + "grad_norm": 3.9123525619506836, + "kl": 1.5547582395374775, + "learning_rate": 8.892709850329989e-07, + "loss": 0.1555, + "num_tokens": 9682448.0, + "reward": 0.73846435546875, + "reward_std": 0.009352664463222027, + "rewards//mean": 0.73846435546875, + "rewards//std": 0.03554629534482956, + "step": 1120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2242, + "grad_norm": 2.711242198944092, + "kl": 1.0546619202941656, + "learning_rate": 8.890717510790762e-07, + "loss": 0.1055, + "num_tokens": 9691064.0, + "reward": 0.74713134765625, + "reward_std": 0.00806482508778572, + "rewards//mean": 0.74713134765625, + "rewards//std": 0.026173792779445648, + "step": 1121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2244, + "grad_norm": 1.9054088592529297, + "kl": 1.5269436184316874, + "learning_rate": 8.888723604070989e-07, + "loss": 0.1527, + "num_tokens": 9699704.0, + "reward": 0.76837158203125, + "reward_std": 0.011166717857122421, + "rewards//mean": 0.76837158203125, + "rewards//std": 0.03374438360333443, + "step": 1122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2246, + "grad_norm": 8.689998626708984, + "kl": 1.329160338267684, + "learning_rate": 8.886728130973813e-07, + "loss": 0.1329, + "num_tokens": 9708296.0, + "reward": 0.772216796875, + "reward_std": 0.012064231559634209, + "rewards//mean": 0.772216796875, + "rewards//std": 0.03167245164513588, + "step": 1123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2248, + "grad_norm": 3.48347806930542, + "kl": 1.6822141632437706, + "learning_rate": 8.884731092303011e-07, + "loss": 0.1682, + "num_tokens": 9717040.0, + "reward": 0.7603759765625, + "reward_std": 0.010618474334478378, + "rewards//mean": 0.7603759765625, + "rewards//std": 0.028883440420031548, + "step": 1124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.225, + "grad_norm": 7.801677703857422, + "kl": 1.9016104098409414, + "learning_rate": 8.882732488862987e-07, + "loss": 0.1902, + "num_tokens": 9725680.0, + "reward": 0.73150634765625, + "reward_std": 0.011720804497599602, + "rewards//mean": 0.73150634765625, + "rewards//std": 0.034381214529275894, + "step": 1125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2252, + "grad_norm": 8.066940307617188, + "kl": 2.135226909071207, + "learning_rate": 8.880732321458784e-07, + "loss": 0.2135, + "num_tokens": 9734240.0, + "reward": 0.7755126953125, + "reward_std": 0.011359816417098045, + "rewards//mean": 0.7755126953125, + "rewards//std": 0.025581583380699158, + "step": 1126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2254, + "grad_norm": 1.7799408435821533, + "kl": 0.9675129484385252, + "learning_rate": 8.878730590896065e-07, + "loss": 0.0968, + "num_tokens": 9742928.0, + "reward": 0.7236328125, + "reward_std": 0.004742627497762442, + "rewards//mean": 0.7236328125, + "rewards//std": 0.03784691169857979, + "step": 1127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2256, + "grad_norm": 2.759489059448242, + "kl": 0.9137209337204695, + "learning_rate": 8.876727297981127e-07, + "loss": 0.0914, + "num_tokens": 9751496.0, + "reward": 0.7816162109375, + "reward_std": 0.007736141327768564, + "rewards//mean": 0.7816162109375, + "rewards//std": 0.024604294449090958, + "step": 1128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2258, + "grad_norm": 2.2090072631835938, + "kl": 1.8953998424112797, + "learning_rate": 8.874722443520898e-07, + "loss": 0.1895, + "num_tokens": 9760136.0, + "reward": 0.76629638671875, + "reward_std": 0.017574312165379524, + "rewards//mean": 0.76629638671875, + "rewards//std": 0.03510836884379387, + "step": 1129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.226, + "grad_norm": 3.8313753604888916, + "kl": 3.062817746773362, + "learning_rate": 8.872716028322931e-07, + "loss": 0.3063, + "num_tokens": 9768864.0, + "reward": 0.76080322265625, + "reward_std": 0.021771468222141266, + "rewards//mean": 0.76080322265625, + "rewards//std": 0.039504826068878174, + "step": 1130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2262, + "grad_norm": 10.609853744506836, + "kl": 2.610361535102129, + "learning_rate": 8.870708053195413e-07, + "loss": 0.261, + "num_tokens": 9777504.0, + "reward": 0.73651123046875, + "reward_std": 0.013166049495339394, + "rewards//mean": 0.73651123046875, + "rewards//std": 0.04736807569861412, + "step": 1131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2264, + "grad_norm": 6.4367289543151855, + "kl": 1.8171419892460108, + "learning_rate": 8.868698518947151e-07, + "loss": 0.1817, + "num_tokens": 9786088.0, + "reward": 0.7452392578125, + "reward_std": 0.008618181571364403, + "rewards//mean": 0.7452392578125, + "rewards//std": 0.024631349369883537, + "step": 1132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2266, + "grad_norm": 2.1947710514068604, + "kl": 1.0404817126691341, + "learning_rate": 8.866687426387591e-07, + "loss": 0.104, + "num_tokens": 9794704.0, + "reward": 0.7593994140625, + "reward_std": 0.00733649218454957, + "rewards//mean": 0.7593994140625, + "rewards//std": 0.03120904229581356, + "step": 1133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2268, + "grad_norm": 9.567975044250488, + "kl": 2.9677893854677677, + "learning_rate": 8.864674776326797e-07, + "loss": 0.2968, + "num_tokens": 9803360.0, + "reward": 0.7906494140625, + "reward_std": 0.015689987689256668, + "rewards//mean": 0.7906494140625, + "rewards//std": 0.02821749821305275, + "step": 1134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.227, + "grad_norm": 1.8530478477478027, + "kl": 1.2280470710247755, + "learning_rate": 8.862660569575464e-07, + "loss": 0.1228, + "num_tokens": 9812040.0, + "reward": 0.76318359375, + "reward_std": 0.007310510613024235, + "rewards//mean": 0.76318359375, + "rewards//std": 0.025205127894878387, + "step": 1135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2272, + "grad_norm": 11.196718215942383, + "kl": 3.1078683994710445, + "learning_rate": 8.860644806944917e-07, + "loss": 0.3108, + "num_tokens": 9820664.0, + "reward": 0.72747802734375, + "reward_std": 0.008620038628578186, + "rewards//mean": 0.72747802734375, + "rewards//std": 0.03392379358410835, + "step": 1136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2274, + "grad_norm": 6.063144207000732, + "kl": 3.298464583232999, + "learning_rate": 8.858627489247104e-07, + "loss": 0.3298, + "num_tokens": 9829264.0, + "reward": 0.75042724609375, + "reward_std": 0.017651591449975967, + "rewards//mean": 0.75042724609375, + "rewards//std": 0.036085180938243866, + "step": 1137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2276, + "grad_norm": 27.15064811706543, + "kl": 3.2940468601882458, + "learning_rate": 8.856608617294599e-07, + "loss": 0.3294, + "num_tokens": 9837976.0, + "reward": 0.7381591796875, + "reward_std": 0.014517206698656082, + "rewards//mean": 0.7381591796875, + "rewards//std": 0.039622530341148376, + "step": 1138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2278, + "grad_norm": 2.138314962387085, + "kl": 1.4599057100713253, + "learning_rate": 8.854588191900604e-07, + "loss": 0.146, + "num_tokens": 9846624.0, + "reward": 0.7435302734375, + "reward_std": 0.010030900128185749, + "rewards//mean": 0.7435302734375, + "rewards//std": 0.027693333104252815, + "step": 1139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.228, + "grad_norm": 3.1714651584625244, + "kl": 1.9531475640833378, + "learning_rate": 8.852566213878946e-07, + "loss": 0.1953, + "num_tokens": 9855208.0, + "reward": 0.7593994140625, + "reward_std": 0.014453301206231117, + "rewards//mean": 0.7593994140625, + "rewards//std": 0.0359075553715229, + "step": 1140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2282, + "grad_norm": 2.9468600749969482, + "kl": 2.570426480844617, + "learning_rate": 8.850542684044078e-07, + "loss": 0.257, + "num_tokens": 9863816.0, + "reward": 0.7938232421875, + "reward_std": 0.016855968162417412, + "rewards//mean": 0.7938232421875, + "rewards//std": 0.032752130180597305, + "step": 1141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2284, + "grad_norm": 4.921816825866699, + "kl": 1.8722131662070751, + "learning_rate": 8.848517603211078e-07, + "loss": 0.1872, + "num_tokens": 9872392.0, + "reward": 0.758544921875, + "reward_std": 0.009364070370793343, + "rewards//mean": 0.758544921875, + "rewards//std": 0.03168009966611862, + "step": 1142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2286, + "grad_norm": 5.602573394775391, + "kl": 2.316109459847212, + "learning_rate": 8.846490972195646e-07, + "loss": 0.2316, + "num_tokens": 9881024.0, + "reward": 0.74676513671875, + "reward_std": 0.01074385829269886, + "rewards//mean": 0.74676513671875, + "rewards//std": 0.038679640740156174, + "step": 1143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2288, + "grad_norm": 46.09404754638672, + "kl": 2.3865990675985813, + "learning_rate": 8.844462791814112e-07, + "loss": 0.2387, + "num_tokens": 9889648.0, + "reward": 0.74615478515625, + "reward_std": 0.007766093127429485, + "rewards//mean": 0.74615478515625, + "rewards//std": 0.029687926173210144, + "step": 1144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.229, + "grad_norm": 3.9606568813323975, + "kl": 1.375477273017168, + "learning_rate": 8.842433062883425e-07, + "loss": 0.1375, + "num_tokens": 9898224.0, + "reward": 0.763427734375, + "reward_std": 0.011231972835958004, + "rewards//mean": 0.763427734375, + "rewards//std": 0.03136507794260979, + "step": 1145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2292, + "grad_norm": 10.372222900390625, + "kl": 2.170891275629401, + "learning_rate": 8.840401786221159e-07, + "loss": 0.2171, + "num_tokens": 9906824.0, + "reward": 0.73858642578125, + "reward_std": 0.01340460404753685, + "rewards//mean": 0.73858642578125, + "rewards//std": 0.03745647519826889, + "step": 1146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2294, + "grad_norm": 3.0244295597076416, + "kl": 1.6726904660463333, + "learning_rate": 8.838368962645513e-07, + "loss": 0.1673, + "num_tokens": 9915560.0, + "reward": 0.7413330078125, + "reward_std": 0.010628901422023773, + "rewards//mean": 0.7413330078125, + "rewards//std": 0.030895095318555832, + "step": 1147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2296, + "grad_norm": 6.284627437591553, + "kl": 2.278836591169238, + "learning_rate": 8.836334592975308e-07, + "loss": 0.2279, + "num_tokens": 9924232.0, + "reward": 0.70709228515625, + "reward_std": 0.014068431220948696, + "rewards//mean": 0.70709228515625, + "rewards//std": 0.0403585359454155, + "step": 1148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2298, + "grad_norm": 7.906750202178955, + "kl": 2.891849149018526, + "learning_rate": 8.834298678029988e-07, + "loss": 0.2892, + "num_tokens": 9932832.0, + "reward": 0.752197265625, + "reward_std": 0.025658084079623222, + "rewards//mean": 0.752197265625, + "rewards//std": 0.042965229600667953, + "step": 1149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.23, + "grad_norm": 3.2445545196533203, + "kl": 1.7269501145929098, + "learning_rate": 8.83226121862962e-07, + "loss": 0.1727, + "num_tokens": 9941384.0, + "reward": 0.7686767578125, + "reward_std": 0.023529019206762314, + "rewards//mean": 0.7686767578125, + "rewards//std": 0.039650026708841324, + "step": 1150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2302, + "grad_norm": 3.0770299434661865, + "kl": 1.5299071036279202, + "learning_rate": 8.83022221559489e-07, + "loss": 0.153, + "num_tokens": 9949968.0, + "reward": 0.7440185546875, + "reward_std": 0.015604786574840546, + "rewards//mean": 0.7440185546875, + "rewards//std": 0.0306471548974514, + "step": 1151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2304, + "grad_norm": 3.2209055423736572, + "kl": 1.4628514032810926, + "learning_rate": 8.82818166974711e-07, + "loss": 0.1463, + "num_tokens": 9958640.0, + "reward": 0.7689208984375, + "reward_std": 0.009194627404212952, + "rewards//mean": 0.7689208984375, + "rewards//std": 0.029513860121369362, + "step": 1152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2306, + "grad_norm": 2.5282833576202393, + "kl": 1.9720770809799433, + "learning_rate": 8.826139581908211e-07, + "loss": 0.1972, + "num_tokens": 9967248.0, + "reward": 0.76751708984375, + "reward_std": 0.013500608503818512, + "rewards//mean": 0.76751708984375, + "rewards//std": 0.033320486545562744, + "step": 1153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2308, + "grad_norm": 7.129243850708008, + "kl": 1.0265081953257322, + "learning_rate": 8.824095952900746e-07, + "loss": 0.1027, + "num_tokens": 9975864.0, + "reward": 0.76123046875, + "reward_std": 0.006395334843546152, + "rewards//mean": 0.76123046875, + "rewards//std": 0.028919318690896034, + "step": 1154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.231, + "grad_norm": 1.9215022325515747, + "kl": 1.4127433262765408, + "learning_rate": 8.822050783547889e-07, + "loss": 0.1413, + "num_tokens": 9984496.0, + "reward": 0.77972412109375, + "reward_std": 0.013265897519886494, + "rewards//mean": 0.77972412109375, + "rewards//std": 0.035527125000953674, + "step": 1155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2312, + "grad_norm": 4.380608558654785, + "kl": 1.1788012031465769, + "learning_rate": 8.820004074673433e-07, + "loss": 0.1179, + "num_tokens": 9993096.0, + "reward": 0.7825927734375, + "reward_std": 0.014705965295433998, + "rewards//mean": 0.7825927734375, + "rewards//std": 0.028933709487318993, + "step": 1156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2314, + "grad_norm": 1.7834618091583252, + "kl": 1.9318585358560085, + "learning_rate": 8.817955827101792e-07, + "loss": 0.1932, + "num_tokens": 10001696.0, + "reward": 0.7471923828125, + "reward_std": 0.012094835750758648, + "rewards//mean": 0.7471923828125, + "rewards//std": 0.03956441581249237, + "step": 1157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2316, + "grad_norm": 3.228813886642456, + "kl": 1.569538813084364, + "learning_rate": 8.815906041658001e-07, + "loss": 0.157, + "num_tokens": 10010312.0, + "reward": 0.756103515625, + "reward_std": 0.010903415270149708, + "rewards//mean": 0.756103515625, + "rewards//std": 0.029967118054628372, + "step": 1158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2318, + "grad_norm": 3.7323246002197266, + "kl": 1.4793742503970861, + "learning_rate": 8.813854719167712e-07, + "loss": 0.1479, + "num_tokens": 10018944.0, + "reward": 0.75079345703125, + "reward_std": 0.009869174100458622, + "rewards//mean": 0.75079345703125, + "rewards//std": 0.029381943866610527, + "step": 1159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.232, + "grad_norm": 6.703781604766846, + "kl": 1.9111527763307095, + "learning_rate": 8.8118018604572e-07, + "loss": 0.1911, + "num_tokens": 10027520.0, + "reward": 0.75616455078125, + "reward_std": 0.01446323562413454, + "rewards//mean": 0.75616455078125, + "rewards//std": 0.037213198840618134, + "step": 1160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2322, + "grad_norm": 4.319847583770752, + "kl": 0.7886298522353172, + "learning_rate": 8.809747466353355e-07, + "loss": 0.0789, + "num_tokens": 10036120.0, + "reward": 0.77099609375, + "reward_std": 0.005149394273757935, + "rewards//mean": 0.77099609375, + "rewards//std": 0.026176178827881813, + "step": 1161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2324, + "grad_norm": 4.049602508544922, + "kl": 2.0440140943974257, + "learning_rate": 8.807691537683684e-07, + "loss": 0.2044, + "num_tokens": 10044704.0, + "reward": 0.7349853515625, + "reward_std": 0.01736447401344776, + "rewards//mean": 0.7349853515625, + "rewards//std": 0.03697431460022926, + "step": 1162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2326, + "grad_norm": 3.7227745056152344, + "kl": 1.336763946339488, + "learning_rate": 8.805634075276317e-07, + "loss": 0.1337, + "num_tokens": 10053336.0, + "reward": 0.75250244140625, + "reward_std": 0.01371496170759201, + "rewards//mean": 0.75250244140625, + "rewards//std": 0.03393539413809776, + "step": 1163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2328, + "grad_norm": 2.9432053565979004, + "kl": 1.2952180672436953, + "learning_rate": 8.80357507996e-07, + "loss": 0.1295, + "num_tokens": 10061936.0, + "reward": 0.7847900390625, + "reward_std": 0.011655289679765701, + "rewards//mean": 0.7847900390625, + "rewards//std": 0.024436375126242638, + "step": 1164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.233, + "grad_norm": 6.470463275909424, + "kl": 0.8546751197427511, + "learning_rate": 8.801514552564095e-07, + "loss": 0.0855, + "num_tokens": 10070680.0, + "reward": 0.775390625, + "reward_std": 0.009649467654526234, + "rewards//mean": 0.775390625, + "rewards//std": 0.029643084853887558, + "step": 1165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2332, + "grad_norm": 11.972208023071289, + "kl": 2.483306748792529, + "learning_rate": 8.799452493918585e-07, + "loss": 0.2483, + "num_tokens": 10079240.0, + "reward": 0.7391357421875, + "reward_std": 0.010093813762068748, + "rewards//mean": 0.7391357421875, + "rewards//std": 0.03905920684337616, + "step": 1166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2334, + "grad_norm": 0.9611997604370117, + "kl": 0.6028448455035686, + "learning_rate": 8.797388904854063e-07, + "loss": 0.0603, + "num_tokens": 10087896.0, + "reward": 0.7562255859375, + "reward_std": 0.0015408683102577925, + "rewards//mean": 0.7562255859375, + "rewards//std": 0.025188006460666656, + "step": 1167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2336, + "grad_norm": 4.824442386627197, + "kl": 1.263984639197588, + "learning_rate": 8.795323786201745e-07, + "loss": 0.1264, + "num_tokens": 10096480.0, + "reward": 0.77398681640625, + "reward_std": 0.016778945922851562, + "rewards//mean": 0.77398681640625, + "rewards//std": 0.03552883118391037, + "step": 1168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2338, + "grad_norm": 3.289353847503662, + "kl": 0.8853769600391388, + "learning_rate": 8.79325713879346e-07, + "loss": 0.0885, + "num_tokens": 10105048.0, + "reward": 0.791259765625, + "reward_std": 0.00915705505758524, + "rewards//mean": 0.791259765625, + "rewards//std": 0.028055289760231972, + "step": 1169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.234, + "grad_norm": 4.047274589538574, + "kl": 1.974445316940546, + "learning_rate": 8.791188963461652e-07, + "loss": 0.1974, + "num_tokens": 10113640.0, + "reward": 0.7313232421875, + "reward_std": 0.011279085651040077, + "rewards//mean": 0.7313232421875, + "rewards//std": 0.03313444182276726, + "step": 1170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2342, + "grad_norm": 2.8111186027526855, + "kl": 0.6526630073785782, + "learning_rate": 8.789119261039384e-07, + "loss": 0.0653, + "num_tokens": 10122320.0, + "reward": 0.779052734375, + "reward_std": 0.005872816778719425, + "rewards//mean": 0.779052734375, + "rewards//std": 0.02503519132733345, + "step": 1171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2344, + "grad_norm": 1.8342902660369873, + "kl": 1.5613654907792807, + "learning_rate": 8.78704803236033e-07, + "loss": 0.1561, + "num_tokens": 10131064.0, + "reward": 0.7276611328125, + "reward_std": 0.0104688024148345, + "rewards//mean": 0.7276611328125, + "rewards//std": 0.021959304809570312, + "step": 1172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2346, + "grad_norm": 3.295234441757202, + "kl": 1.537804240360856, + "learning_rate": 8.784975278258782e-07, + "loss": 0.1538, + "num_tokens": 10139696.0, + "reward": 0.726806640625, + "reward_std": 0.010460578836500645, + "rewards//mean": 0.726806640625, + "rewards//std": 0.03383180499076843, + "step": 1173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2348, + "grad_norm": 3.2483508586883545, + "kl": 0.9632603842765093, + "learning_rate": 8.782900999569645e-07, + "loss": 0.0963, + "num_tokens": 10148320.0, + "reward": 0.75445556640625, + "reward_std": 0.007059913594275713, + "rewards//mean": 0.75445556640625, + "rewards//std": 0.029255446046590805, + "step": 1174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.235, + "grad_norm": 4.360206604003906, + "kl": 1.2281594015657902, + "learning_rate": 8.780825197128437e-07, + "loss": 0.1228, + "num_tokens": 10156992.0, + "reward": 0.7880859375, + "reward_std": 0.008820902556180954, + "rewards//mean": 0.7880859375, + "rewards//std": 0.03365325182676315, + "step": 1175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2352, + "grad_norm": 2.771652936935425, + "kl": 0.9551923777908087, + "learning_rate": 8.778747871771291e-07, + "loss": 0.0955, + "num_tokens": 10165576.0, + "reward": 0.76788330078125, + "reward_std": 0.00427279295399785, + "rewards//mean": 0.76788330078125, + "rewards//std": 0.015069565735757351, + "step": 1176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2354, + "grad_norm": 11.196528434753418, + "kl": 1.2639826629310846, + "learning_rate": 8.776669024334955e-07, + "loss": 0.1264, + "num_tokens": 10174256.0, + "reward": 0.75341796875, + "reward_std": 0.00726303830742836, + "rewards//mean": 0.75341796875, + "rewards//std": 0.02312016673386097, + "step": 1177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2356, + "grad_norm": 8.246557235717773, + "kl": 1.1232012081891298, + "learning_rate": 8.774588655656787e-07, + "loss": 0.1123, + "num_tokens": 10182920.0, + "reward": 0.731201171875, + "reward_std": 0.005265967454761267, + "rewards//mean": 0.731201171875, + "rewards//std": 0.040372833609580994, + "step": 1178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2358, + "grad_norm": 3.1767334938049316, + "kl": 1.5803054478019476, + "learning_rate": 8.772506766574761e-07, + "loss": 0.158, + "num_tokens": 10191600.0, + "reward": 0.7603759765625, + "reward_std": 0.012790179811418056, + "rewards//mean": 0.7603759765625, + "rewards//std": 0.03148334100842476, + "step": 1179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.236, + "grad_norm": 3.4187545776367188, + "kl": 1.6482271905988455, + "learning_rate": 8.770423357927462e-07, + "loss": 0.1648, + "num_tokens": 10200344.0, + "reward": 0.73138427734375, + "reward_std": 0.008016904816031456, + "rewards//mean": 0.73138427734375, + "rewards//std": 0.02673683688044548, + "step": 1180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2362, + "grad_norm": 5.574641227722168, + "kl": 2.4331605937331915, + "learning_rate": 8.768338430554082e-07, + "loss": 0.2433, + "num_tokens": 10208960.0, + "reward": 0.7445068359375, + "reward_std": 0.01510899793356657, + "rewards//mean": 0.7445068359375, + "rewards//std": 0.04163595661520958, + "step": 1181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2364, + "grad_norm": 3.7420549392700195, + "kl": 2.0698746386915445, + "learning_rate": 8.766251985294434e-07, + "loss": 0.207, + "num_tokens": 10217648.0, + "reward": 0.75933837890625, + "reward_std": 0.013081587851047516, + "rewards//mean": 0.75933837890625, + "rewards//std": 0.03291551023721695, + "step": 1182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2366, + "grad_norm": 2.152587890625, + "kl": 1.5023798700422049, + "learning_rate": 8.764164022988937e-07, + "loss": 0.1502, + "num_tokens": 10226272.0, + "reward": 0.7518310546875, + "reward_std": 0.009207741357386112, + "rewards//mean": 0.7518310546875, + "rewards//std": 0.028125077486038208, + "step": 1183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2368, + "grad_norm": 2.536895751953125, + "kl": 1.8943487722426653, + "learning_rate": 8.762074544478621e-07, + "loss": 0.1894, + "num_tokens": 10234856.0, + "reward": 0.74957275390625, + "reward_std": 0.011271432042121887, + "rewards//mean": 0.74957275390625, + "rewards//std": 0.029635872691869736, + "step": 1184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.237, + "grad_norm": 6.344274520874023, + "kl": 1.4856875035911798, + "learning_rate": 8.75998355060513e-07, + "loss": 0.1486, + "num_tokens": 10243440.0, + "reward": 0.73614501953125, + "reward_std": 0.008308660238981247, + "rewards//mean": 0.73614501953125, + "rewards//std": 0.03561776876449585, + "step": 1185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2372, + "grad_norm": 2.1593058109283447, + "kl": 2.0834833960980177, + "learning_rate": 8.757891042210712e-07, + "loss": 0.2083, + "num_tokens": 10252096.0, + "reward": 0.76531982421875, + "reward_std": 0.014573352411389351, + "rewards//mean": 0.76531982421875, + "rewards//std": 0.03263237699866295, + "step": 1186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2374, + "grad_norm": 2.0074822902679443, + "kl": 1.9211790971457958, + "learning_rate": 8.755797020138234e-07, + "loss": 0.1921, + "num_tokens": 10260736.0, + "reward": 0.74188232421875, + "reward_std": 0.012608667835593224, + "rewards//mean": 0.74188232421875, + "rewards//std": 0.029588330537080765, + "step": 1187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2376, + "grad_norm": 2.497415065765381, + "kl": 1.0434564761817455, + "learning_rate": 8.753701485231164e-07, + "loss": 0.1043, + "num_tokens": 10269320.0, + "reward": 0.74847412109375, + "reward_std": 0.007842149585485458, + "rewards//mean": 0.74847412109375, + "rewards//std": 0.034475311636924744, + "step": 1188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2378, + "grad_norm": 3.1493000984191895, + "kl": 1.3220611158758402, + "learning_rate": 8.751604438333586e-07, + "loss": 0.1322, + "num_tokens": 10278008.0, + "reward": 0.766357421875, + "reward_std": 0.010030495002865791, + "rewards//mean": 0.766357421875, + "rewards//std": 0.024733463302254677, + "step": 1189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.238, + "grad_norm": 2.0060431957244873, + "kl": 1.1081401947885752, + "learning_rate": 8.749505880290188e-07, + "loss": 0.1108, + "num_tokens": 10286632.0, + "reward": 0.76080322265625, + "reward_std": 0.007352760061621666, + "rewards//mean": 0.76080322265625, + "rewards//std": 0.0350336991250515, + "step": 1190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2382, + "grad_norm": 4.355103969573975, + "kl": 1.3405583892017603, + "learning_rate": 8.74740581194627e-07, + "loss": 0.1341, + "num_tokens": 10295192.0, + "reward": 0.74468994140625, + "reward_std": 0.01251955982297659, + "rewards//mean": 0.74468994140625, + "rewards//std": 0.025338031351566315, + "step": 1191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2384, + "grad_norm": 3.6300463676452637, + "kl": 2.07859293743968, + "learning_rate": 8.745304234147739e-07, + "loss": 0.2079, + "num_tokens": 10303872.0, + "reward": 0.7431640625, + "reward_std": 0.012090795673429966, + "rewards//mean": 0.7431640625, + "rewards//std": 0.03787250071763992, + "step": 1192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2386, + "grad_norm": 3.209120988845825, + "kl": 0.951006256043911, + "learning_rate": 8.743201147741111e-07, + "loss": 0.0951, + "num_tokens": 10312528.0, + "reward": 0.76507568359375, + "reward_std": 0.008098564110696316, + "rewards//mean": 0.76507568359375, + "rewards//std": 0.025189433246850967, + "step": 1193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2388, + "grad_norm": 2.1020562648773193, + "kl": 0.973062552511692, + "learning_rate": 8.741096553573506e-07, + "loss": 0.0973, + "num_tokens": 10321160.0, + "reward": 0.7633056640625, + "reward_std": 0.005382574163377285, + "rewards//mean": 0.7633056640625, + "rewards//std": 0.03894898667931557, + "step": 1194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.239, + "grad_norm": 1.7738032341003418, + "kl": 1.6766906436532736, + "learning_rate": 8.73899045249266e-07, + "loss": 0.1677, + "num_tokens": 10329808.0, + "reward": 0.78106689453125, + "reward_std": 0.01804327219724655, + "rewards//mean": 0.78106689453125, + "rewards//std": 0.02997160144150257, + "step": 1195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2392, + "grad_norm": 7.119793891906738, + "kl": 1.8139428477734327, + "learning_rate": 8.736882845346905e-07, + "loss": 0.1814, + "num_tokens": 10338520.0, + "reward": 0.72161865234375, + "reward_std": 0.004580400418490171, + "rewards//mean": 0.72161865234375, + "rewards//std": 0.023481298238039017, + "step": 1196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2394, + "grad_norm": 3.3124215602874756, + "kl": 1.0021106284111738, + "learning_rate": 8.734773732985185e-07, + "loss": 0.1002, + "num_tokens": 10347160.0, + "reward": 0.74310302734375, + "reward_std": 0.006465718150138855, + "rewards//mean": 0.74310302734375, + "rewards//std": 0.03422942012548447, + "step": 1197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2396, + "grad_norm": 2.3718671798706055, + "kl": 1.407300479710102, + "learning_rate": 8.732663116257055e-07, + "loss": 0.1407, + "num_tokens": 10355760.0, + "reward": 0.7791748046875, + "reward_std": 0.00996287353336811, + "rewards//mean": 0.7791748046875, + "rewards//std": 0.03363142907619476, + "step": 1198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2398, + "grad_norm": 4.85661506652832, + "kl": 2.3818425964564085, + "learning_rate": 8.730550996012667e-07, + "loss": 0.2382, + "num_tokens": 10364384.0, + "reward": 0.74798583984375, + "reward_std": 0.016918540000915527, + "rewards//mean": 0.74798583984375, + "rewards//std": 0.027125800028443336, + "step": 1199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.24, + "grad_norm": 2.8339829444885254, + "kl": 1.1268452275544405, + "learning_rate": 8.728437373102784e-07, + "loss": 0.1127, + "num_tokens": 10372912.0, + "reward": 0.77685546875, + "reward_std": 0.009339243173599243, + "rewards//mean": 0.77685546875, + "rewards//std": 0.02476893737912178, + "step": 1200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2402, + "grad_norm": 2.791949987411499, + "kl": 1.2671589367091656, + "learning_rate": 8.726322248378774e-07, + "loss": 0.1267, + "num_tokens": 10381504.0, + "reward": 0.7745361328125, + "reward_std": 0.00692584365606308, + "rewards//mean": 0.7745361328125, + "rewards//std": 0.021389398723840714, + "step": 1201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2404, + "grad_norm": 2.6466314792633057, + "kl": 1.5692084152251482, + "learning_rate": 8.724205622692606e-07, + "loss": 0.1569, + "num_tokens": 10390168.0, + "reward": 0.7591552734375, + "reward_std": 0.010664994828402996, + "rewards//mean": 0.7591552734375, + "rewards//std": 0.027902444824576378, + "step": 1202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2406, + "grad_norm": 3.629345417022705, + "kl": 1.674876980483532, + "learning_rate": 8.72208749689686e-07, + "loss": 0.1675, + "num_tokens": 10398736.0, + "reward": 0.7613525390625, + "reward_std": 0.012000908143818378, + "rewards//mean": 0.7613525390625, + "rewards//std": 0.02549385465681553, + "step": 1203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2408, + "grad_norm": 7.045103073120117, + "kl": 1.3719742316752672, + "learning_rate": 8.719967871844715e-07, + "loss": 0.1372, + "num_tokens": 10407320.0, + "reward": 0.6990966796875, + "reward_std": 0.007162667810916901, + "rewards//mean": 0.6990966796875, + "rewards//std": 0.04391361027956009, + "step": 1204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.241, + "grad_norm": 2.055586338043213, + "kl": 1.0216310992836952, + "learning_rate": 8.717846748389955e-07, + "loss": 0.1022, + "num_tokens": 10415912.0, + "reward": 0.72021484375, + "reward_std": 0.005646876059472561, + "rewards//mean": 0.72021484375, + "rewards//std": 0.03211009502410889, + "step": 1205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2412, + "grad_norm": 3.369438409805298, + "kl": 1.2664784640073776, + "learning_rate": 8.71572412738697e-07, + "loss": 0.1266, + "num_tokens": 10424592.0, + "reward": 0.75750732421875, + "reward_std": 0.007272562012076378, + "rewards//mean": 0.75750732421875, + "rewards//std": 0.028316771611571312, + "step": 1206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2414, + "grad_norm": 2.284891128540039, + "kl": 1.4312961101531982, + "learning_rate": 8.713600009690751e-07, + "loss": 0.1431, + "num_tokens": 10433208.0, + "reward": 0.75244140625, + "reward_std": 0.008055581711232662, + "rewards//mean": 0.75244140625, + "rewards//std": 0.036594267934560776, + "step": 1207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2416, + "grad_norm": 6.646371841430664, + "kl": 1.6266127917915583, + "learning_rate": 8.711474396156892e-07, + "loss": 0.1627, + "num_tokens": 10441728.0, + "reward": 0.74932861328125, + "reward_std": 0.014531968161463737, + "rewards//mean": 0.74932861328125, + "rewards//std": 0.030989699065685272, + "step": 1208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2418, + "grad_norm": 2.9526915550231934, + "kl": 1.5980976950377226, + "learning_rate": 8.709347287641592e-07, + "loss": 0.1598, + "num_tokens": 10450336.0, + "reward": 0.75726318359375, + "reward_std": 0.012622412294149399, + "rewards//mean": 0.75726318359375, + "rewards//std": 0.033803097903728485, + "step": 1209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.242, + "grad_norm": 4.072620868682861, + "kl": 1.4195548593997955, + "learning_rate": 8.707218685001646e-07, + "loss": 0.142, + "num_tokens": 10458856.0, + "reward": 0.7459716796875, + "reward_std": 0.008270646445453167, + "rewards//mean": 0.7459716796875, + "rewards//std": 0.03224910423159599, + "step": 1210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2422, + "grad_norm": 0.8886381387710571, + "kl": 0.87002008035779, + "learning_rate": 8.705088589094458e-07, + "loss": 0.087, + "num_tokens": 10467424.0, + "reward": 0.72772216796875, + "reward_std": 0.004259682726114988, + "rewards//mean": 0.72772216796875, + "rewards//std": 0.033208996057510376, + "step": 1211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2424, + "grad_norm": 3.159609317779541, + "kl": 1.124261612072587, + "learning_rate": 8.702957000778029e-07, + "loss": 0.1124, + "num_tokens": 10476160.0, + "reward": 0.773681640625, + "reward_std": 0.007289689499884844, + "rewards//mean": 0.773681640625, + "rewards//std": 0.02921408787369728, + "step": 1212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2426, + "grad_norm": 3.1278998851776123, + "kl": 1.4958477690815926, + "learning_rate": 8.700823920910963e-07, + "loss": 0.1496, + "num_tokens": 10484800.0, + "reward": 0.74505615234375, + "reward_std": 0.012510336935520172, + "rewards//mean": 0.74505615234375, + "rewards//std": 0.028900403529405594, + "step": 1213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2428, + "grad_norm": 2.582415819168091, + "kl": 1.4311014134436846, + "learning_rate": 8.698689350352464e-07, + "loss": 0.1431, + "num_tokens": 10493448.0, + "reward": 0.7889404296875, + "reward_std": 0.007610922213643789, + "rewards//mean": 0.7889404296875, + "rewards//std": 0.018649086356163025, + "step": 1214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.243, + "grad_norm": 3.446004629135132, + "kl": 1.5160708278417587, + "learning_rate": 8.696553289962337e-07, + "loss": 0.1516, + "num_tokens": 10502088.0, + "reward": 0.77960205078125, + "reward_std": 0.010885559022426605, + "rewards//mean": 0.77960205078125, + "rewards//std": 0.021696729585528374, + "step": 1215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2432, + "grad_norm": 2.9214131832122803, + "kl": 1.4365063477307558, + "learning_rate": 8.694415740600988e-07, + "loss": 0.1437, + "num_tokens": 10510608.0, + "reward": 0.7723388671875, + "reward_std": 0.009778087958693504, + "rewards//mean": 0.7723388671875, + "rewards//std": 0.02492459863424301, + "step": 1216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2434, + "grad_norm": 1.2435612678527832, + "kl": 0.8110390044748783, + "learning_rate": 8.69227670312942e-07, + "loss": 0.0811, + "num_tokens": 10519232.0, + "reward": 0.7650146484375, + "reward_std": 0.005041959695518017, + "rewards//mean": 0.7650146484375, + "rewards//std": 0.020870720967650414, + "step": 1217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2436, + "grad_norm": 0.4521288275718689, + "kl": 0.43685695715248585, + "learning_rate": 8.690136178409235e-07, + "loss": 0.0437, + "num_tokens": 10527872.0, + "reward": 0.767578125, + "reward_std": 0.0016451351111754775, + "rewards//mean": 0.767578125, + "rewards//std": 0.02483241632580757, + "step": 1218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2438, + "grad_norm": 1.9898988008499146, + "kl": 1.0086859688162804, + "learning_rate": 8.687994167302641e-07, + "loss": 0.1009, + "num_tokens": 10536560.0, + "reward": 0.71978759765625, + "reward_std": 0.004595073405653238, + "rewards//mean": 0.71978759765625, + "rewards//std": 0.024401891976594925, + "step": 1219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.244, + "grad_norm": 3.703798294067383, + "kl": 0.8747746162116528, + "learning_rate": 8.685850670672438e-07, + "loss": 0.0875, + "num_tokens": 10545168.0, + "reward": 0.7659912109375, + "reward_std": 0.008853060193359852, + "rewards//mean": 0.7659912109375, + "rewards//std": 0.030550191178917885, + "step": 1220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2442, + "grad_norm": 2.477851152420044, + "kl": 1.1440365593880415, + "learning_rate": 8.683705689382024e-07, + "loss": 0.1144, + "num_tokens": 10553912.0, + "reward": 0.73016357421875, + "reward_std": 0.00880036223679781, + "rewards//mean": 0.73016357421875, + "rewards//std": 0.017473505809903145, + "step": 1221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2444, + "grad_norm": 4.674661159515381, + "kl": 0.7348966244608164, + "learning_rate": 8.6815592242954e-07, + "loss": 0.0735, + "num_tokens": 10562520.0, + "reward": 0.72906494140625, + "reward_std": 0.010388755239546299, + "rewards//mean": 0.72906494140625, + "rewards//std": 0.04088842123746872, + "step": 1222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2446, + "grad_norm": 5.477402210235596, + "kl": 0.835672477260232, + "learning_rate": 8.67941127627716e-07, + "loss": 0.0836, + "num_tokens": 10571288.0, + "reward": 0.78704833984375, + "reward_std": 0.011899751611053944, + "rewards//mean": 0.78704833984375, + "rewards//std": 0.027874423190951347, + "step": 1223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2448, + "grad_norm": 6.605255603790283, + "kl": 0.9427262730896473, + "learning_rate": 8.677261846192499e-07, + "loss": 0.0943, + "num_tokens": 10579888.0, + "reward": 0.7578125, + "reward_std": 0.009779705666005611, + "rewards//mean": 0.7578125, + "rewards//std": 0.027909226715564728, + "step": 1224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.245, + "grad_norm": 2.804797887802124, + "kl": 1.780638962984085, + "learning_rate": 8.675110934907204e-07, + "loss": 0.1781, + "num_tokens": 10588520.0, + "reward": 0.77777099609375, + "reward_std": 0.010743267834186554, + "rewards//mean": 0.77777099609375, + "rewards//std": 0.024823803454637527, + "step": 1225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2452, + "grad_norm": 1.7409158945083618, + "kl": 1.267751183360815, + "learning_rate": 8.672958543287666e-07, + "loss": 0.1268, + "num_tokens": 10597080.0, + "reward": 0.77142333984375, + "reward_std": 0.010439383797347546, + "rewards//mean": 0.77142333984375, + "rewards//std": 0.02546793781220913, + "step": 1226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2454, + "grad_norm": 1.2458970546722412, + "kl": 1.0435888636857271, + "learning_rate": 8.670804672200865e-07, + "loss": 0.1044, + "num_tokens": 10605704.0, + "reward": 0.7398681640625, + "reward_std": 0.005552348215132952, + "rewards//mean": 0.7398681640625, + "rewards//std": 0.025635965168476105, + "step": 1227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2456, + "grad_norm": 3.520935297012329, + "kl": 0.8077416438609362, + "learning_rate": 8.668649322514381e-07, + "loss": 0.0808, + "num_tokens": 10614352.0, + "reward": 0.73052978515625, + "reward_std": 0.006767407990992069, + "rewards//mean": 0.73052978515625, + "rewards//std": 0.02818979136645794, + "step": 1228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2458, + "grad_norm": 4.556258201599121, + "kl": 1.0413630101829767, + "learning_rate": 8.666492495096389e-07, + "loss": 0.1041, + "num_tokens": 10622968.0, + "reward": 0.7535400390625, + "reward_std": 0.00897935964167118, + "rewards//mean": 0.7535400390625, + "rewards//std": 0.03483942151069641, + "step": 1229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.246, + "grad_norm": 2.738713026046753, + "kl": 1.480254141613841, + "learning_rate": 8.664334190815659e-07, + "loss": 0.148, + "num_tokens": 10631624.0, + "reward": 0.7525634765625, + "reward_std": 0.010982338339090347, + "rewards//mean": 0.7525634765625, + "rewards//std": 0.028273237869143486, + "step": 1230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2462, + "grad_norm": 5.985865592956543, + "kl": 2.037445917725563, + "learning_rate": 8.662174410541554e-07, + "loss": 0.2037, + "num_tokens": 10640224.0, + "reward": 0.7392578125, + "reward_std": 0.012627032585442066, + "rewards//mean": 0.7392578125, + "rewards//std": 0.03356677293777466, + "step": 1231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2464, + "grad_norm": 3.7179629802703857, + "kl": 1.844133771955967, + "learning_rate": 8.660013155144035e-07, + "loss": 0.1844, + "num_tokens": 10648752.0, + "reward": 0.72821044921875, + "reward_std": 0.016244065016508102, + "rewards//mean": 0.72821044921875, + "rewards//std": 0.03532329574227333, + "step": 1232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2466, + "grad_norm": 4.169260501861572, + "kl": 1.7794784530997276, + "learning_rate": 8.657850425493654e-07, + "loss": 0.1779, + "num_tokens": 10657368.0, + "reward": 0.746337890625, + "reward_std": 0.01339347381144762, + "rewards//mean": 0.746337890625, + "rewards//std": 0.039841409772634506, + "step": 1233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2468, + "grad_norm": 3.695326328277588, + "kl": 1.2684337049722672, + "learning_rate": 8.65568622246156e-07, + "loss": 0.1268, + "num_tokens": 10666032.0, + "reward": 0.762451171875, + "reward_std": 0.008404484018683434, + "rewards//mean": 0.762451171875, + "rewards//std": 0.027285005897283554, + "step": 1234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.247, + "grad_norm": 2.6150856018066406, + "kl": 0.7142418771982193, + "learning_rate": 8.653520546919493e-07, + "loss": 0.0714, + "num_tokens": 10674592.0, + "reward": 0.786865234375, + "reward_std": 0.006299816071987152, + "rewards//mean": 0.786865234375, + "rewards//std": 0.03240573778748512, + "step": 1235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2472, + "grad_norm": 2.7588894367218018, + "kl": 1.3413833174854517, + "learning_rate": 8.651353399739787e-07, + "loss": 0.1341, + "num_tokens": 10683232.0, + "reward": 0.74127197265625, + "reward_std": 0.010000904090702534, + "rewards//mean": 0.74127197265625, + "rewards//std": 0.03273935988545418, + "step": 1236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2474, + "grad_norm": 6.649838447570801, + "kl": 2.623996540904045, + "learning_rate": 8.649184781795367e-07, + "loss": 0.2624, + "num_tokens": 10691880.0, + "reward": 0.78399658203125, + "reward_std": 0.021707233041524887, + "rewards//mean": 0.78399658203125, + "rewards//std": 0.03591403737664223, + "step": 1237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2476, + "grad_norm": 4.778037071228027, + "kl": 1.628692101687193, + "learning_rate": 8.647014693959753e-07, + "loss": 0.1629, + "num_tokens": 10700504.0, + "reward": 0.725830078125, + "reward_std": 0.01012200117111206, + "rewards//mean": 0.725830078125, + "rewards//std": 0.036410968750715256, + "step": 1238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2478, + "grad_norm": 3.7553439140319824, + "kl": 1.1641744170337915, + "learning_rate": 8.644843137107057e-07, + "loss": 0.1164, + "num_tokens": 10709176.0, + "reward": 0.77886962890625, + "reward_std": 0.007269697263836861, + "rewards//mean": 0.77886962890625, + "rewards//std": 0.023964976891875267, + "step": 1239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.248, + "grad_norm": 1.3512905836105347, + "kl": 1.2719145566225052, + "learning_rate": 8.642670112111981e-07, + "loss": 0.1272, + "num_tokens": 10717912.0, + "reward": 0.76165771484375, + "reward_std": 0.008655503392219543, + "rewards//mean": 0.76165771484375, + "rewards//std": 0.03598688170313835, + "step": 1240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2482, + "grad_norm": 1.5786871910095215, + "kl": 0.9103036411106586, + "learning_rate": 8.64049561984982e-07, + "loss": 0.091, + "num_tokens": 10726616.0, + "reward": 0.75714111328125, + "reward_std": 0.004460044205188751, + "rewards//mean": 0.75714111328125, + "rewards//std": 0.020190447568893433, + "step": 1241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2484, + "grad_norm": 2.169869899749756, + "kl": 1.040619820356369, + "learning_rate": 8.638319661196459e-07, + "loss": 0.1041, + "num_tokens": 10735200.0, + "reward": 0.76641845703125, + "reward_std": 0.007590742781758308, + "rewards//mean": 0.76641845703125, + "rewards//std": 0.02504781074821949, + "step": 1242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2486, + "grad_norm": 10.328278541564941, + "kl": 2.630313467234373, + "learning_rate": 8.636142237028372e-07, + "loss": 0.263, + "num_tokens": 10743880.0, + "reward": 0.73651123046875, + "reward_std": 0.01347922533750534, + "rewards//mean": 0.73651123046875, + "rewards//std": 0.03510405868291855, + "step": 1243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2488, + "grad_norm": 7.640003681182861, + "kl": 2.4689796324819326, + "learning_rate": 8.633963348222628e-07, + "loss": 0.2469, + "num_tokens": 10752560.0, + "reward": 0.74615478515625, + "reward_std": 0.010604561306536198, + "rewards//mean": 0.74615478515625, + "rewards//std": 0.036508072167634964, + "step": 1244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.249, + "grad_norm": 4.208217620849609, + "kl": 1.5706639103591442, + "learning_rate": 8.631782995656882e-07, + "loss": 0.1571, + "num_tokens": 10761192.0, + "reward": 0.75830078125, + "reward_std": 0.0143938809633255, + "rewards//mean": 0.75830078125, + "rewards//std": 0.03016548976302147, + "step": 1245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2492, + "grad_norm": 2.6180520057678223, + "kl": 1.5482365787029266, + "learning_rate": 8.62960118020938e-07, + "loss": 0.1548, + "num_tokens": 10769904.0, + "reward": 0.71759033203125, + "reward_std": 0.010162541642785072, + "rewards//mean": 0.71759033203125, + "rewards//std": 0.04660388082265854, + "step": 1246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2494, + "grad_norm": 2.4136457443237305, + "kl": 1.4631451219320297, + "learning_rate": 8.627417902758956e-07, + "loss": 0.1463, + "num_tokens": 10778632.0, + "reward": 0.7861328125, + "reward_std": 0.014890835620462894, + "rewards//mean": 0.7861328125, + "rewards//std": 0.02900712378323078, + "step": 1247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2496, + "grad_norm": 1.8261510133743286, + "kl": 1.0705778319388628, + "learning_rate": 8.625233164185034e-07, + "loss": 0.1071, + "num_tokens": 10787216.0, + "reward": 0.751708984375, + "reward_std": 0.005420691333711147, + "rewards//mean": 0.751708984375, + "rewards//std": 0.02442800998687744, + "step": 1248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2498, + "grad_norm": 1.898107647895813, + "kl": 0.8983949609100819, + "learning_rate": 8.623046965367628e-07, + "loss": 0.0898, + "num_tokens": 10795792.0, + "reward": 0.7265625, + "reward_std": 0.003153369063511491, + "rewards//mean": 0.7265625, + "rewards//std": 0.021156350150704384, + "step": 1249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.25, + "grad_norm": 3.469754934310913, + "kl": 2.335224675014615, + "learning_rate": 8.620859307187338e-07, + "loss": 0.2335, + "num_tokens": 10804400.0, + "reward": 0.74237060546875, + "reward_std": 0.012575688771903515, + "rewards//mean": 0.74237060546875, + "rewards//std": 0.02802553027868271, + "step": 1250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2502, + "grad_norm": 3.4656877517700195, + "kl": 1.8427002094686031, + "learning_rate": 8.61867019052535e-07, + "loss": 0.1843, + "num_tokens": 10813208.0, + "reward": 0.75823974609375, + "reward_std": 0.011305361986160278, + "rewards//mean": 0.75823974609375, + "rewards//std": 0.03339400514960289, + "step": 1251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2504, + "grad_norm": 2.106476068496704, + "kl": 2.1087175458669662, + "learning_rate": 8.616479616263444e-07, + "loss": 0.2109, + "num_tokens": 10821848.0, + "reward": 0.75262451171875, + "reward_std": 0.013109234161674976, + "rewards//mean": 0.75262451171875, + "rewards//std": 0.03394743800163269, + "step": 1252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2506, + "grad_norm": 2.731506586074829, + "kl": 0.6706829220056534, + "learning_rate": 8.61428758528398e-07, + "loss": 0.0671, + "num_tokens": 10830376.0, + "reward": 0.7547607421875, + "reward_std": 0.004223778378218412, + "rewards//mean": 0.7547607421875, + "rewards//std": 0.01751730777323246, + "step": 1253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2508, + "grad_norm": 2.818578004837036, + "kl": 1.552479200065136, + "learning_rate": 8.612094098469909e-07, + "loss": 0.1552, + "num_tokens": 10838992.0, + "reward": 0.77081298828125, + "reward_std": 0.013558020815253258, + "rewards//mean": 0.77081298828125, + "rewards//std": 0.029463233426213264, + "step": 1254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.251, + "grad_norm": 6.754956245422363, + "kl": 1.914738615974784, + "learning_rate": 8.609899156704767e-07, + "loss": 0.1915, + "num_tokens": 10847728.0, + "reward": 0.7509765625, + "reward_std": 0.011311469599604607, + "rewards//mean": 0.7509765625, + "rewards//std": 0.03421001136302948, + "step": 1255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2512, + "grad_norm": 5.389358043670654, + "kl": 2.6926444843411446, + "learning_rate": 8.607702760872677e-07, + "loss": 0.2693, + "num_tokens": 10856544.0, + "reward": 0.776611328125, + "reward_std": 0.01727277785539627, + "rewards//mean": 0.776611328125, + "rewards//std": 0.05261942744255066, + "step": 1256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2514, + "grad_norm": 2.313718557357788, + "kl": 1.280439605936408, + "learning_rate": 8.605504911858346e-07, + "loss": 0.128, + "num_tokens": 10865104.0, + "reward": 0.7525634765625, + "reward_std": 0.006848743185400963, + "rewards//mean": 0.7525634765625, + "rewards//std": 0.02454022504389286, + "step": 1257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2516, + "grad_norm": 1.7476269006729126, + "kl": 1.4869457203894854, + "learning_rate": 8.603305610547069e-07, + "loss": 0.1487, + "num_tokens": 10873816.0, + "reward": 0.7633056640625, + "reward_std": 0.010295258834958076, + "rewards//mean": 0.7633056640625, + "rewards//std": 0.04068861901760101, + "step": 1258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2518, + "grad_norm": 2.303586721420288, + "kl": 1.176751121878624, + "learning_rate": 8.601104857824722e-07, + "loss": 0.1177, + "num_tokens": 10882440.0, + "reward": 0.79437255859375, + "reward_std": 0.006195859983563423, + "rewards//mean": 0.79437255859375, + "rewards//std": 0.030515672639012337, + "step": 1259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.252, + "grad_norm": 2.7836127281188965, + "kl": 1.0767667312175035, + "learning_rate": 8.598902654577768e-07, + "loss": 0.1077, + "num_tokens": 10891048.0, + "reward": 0.7734375, + "reward_std": 0.01202109083533287, + "rewards//mean": 0.7734375, + "rewards//std": 0.0266120582818985, + "step": 1260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2522, + "grad_norm": 3.5508615970611572, + "kl": 1.8393850270658731, + "learning_rate": 8.596699001693255e-07, + "loss": 0.1839, + "num_tokens": 10899720.0, + "reward": 0.750244140625, + "reward_std": 0.015253997407853603, + "rewards//mean": 0.750244140625, + "rewards//std": 0.034610338509082794, + "step": 1261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2524, + "grad_norm": 5.652132034301758, + "kl": 1.8502464350312948, + "learning_rate": 8.594493900058816e-07, + "loss": 0.185, + "num_tokens": 10908304.0, + "reward": 0.766845703125, + "reward_std": 0.01523201446980238, + "rewards//mean": 0.766845703125, + "rewards//std": 0.052180320024490356, + "step": 1262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2526, + "grad_norm": 3.7884795665740967, + "kl": 1.8132641948759556, + "learning_rate": 8.592287350562663e-07, + "loss": 0.1813, + "num_tokens": 10916880.0, + "reward": 0.782958984375, + "reward_std": 0.010780639946460724, + "rewards//mean": 0.782958984375, + "rewards//std": 0.033326905220746994, + "step": 1263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2528, + "grad_norm": 3.681049346923828, + "kl": 0.9148434549570084, + "learning_rate": 8.590079354093593e-07, + "loss": 0.0915, + "num_tokens": 10925520.0, + "reward": 0.76666259765625, + "reward_std": 0.0037816944532096386, + "rewards//mean": 0.76666259765625, + "rewards//std": 0.019981687888503075, + "step": 1264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.253, + "grad_norm": 0.6351627707481384, + "kl": 0.7859466709196568, + "learning_rate": 8.587869911540992e-07, + "loss": 0.0786, + "num_tokens": 10934168.0, + "reward": 0.75091552734375, + "reward_std": 0.00666106166318059, + "rewards//mean": 0.75091552734375, + "rewards//std": 0.02945244126021862, + "step": 1265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2532, + "grad_norm": 1.621062159538269, + "kl": 2.01824108697474, + "learning_rate": 8.585659023794818e-07, + "loss": 0.2018, + "num_tokens": 10942832.0, + "reward": 0.745361328125, + "reward_std": 0.016542179509997368, + "rewards//mean": 0.745361328125, + "rewards//std": 0.036939289420843124, + "step": 1266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2534, + "grad_norm": 5.1790771484375, + "kl": 0.8341042678803205, + "learning_rate": 8.583446691745617e-07, + "loss": 0.0834, + "num_tokens": 10951568.0, + "reward": 0.7872314453125, + "reward_std": 0.008905846625566483, + "rewards//mean": 0.7872314453125, + "rewards//std": 0.020646117627620697, + "step": 1267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2536, + "grad_norm": 2.450307607650757, + "kl": 0.867433724924922, + "learning_rate": 8.581232916284517e-07, + "loss": 0.0867, + "num_tokens": 10960248.0, + "reward": 0.75677490234375, + "reward_std": 0.0065104844979941845, + "rewards//mean": 0.75677490234375, + "rewards//std": 0.02819623425602913, + "step": 1268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2538, + "grad_norm": 2.024763584136963, + "kl": 1.301382478326559, + "learning_rate": 8.579017698303228e-07, + "loss": 0.1301, + "num_tokens": 10968904.0, + "reward": 0.75848388671875, + "reward_std": 0.008169629611074924, + "rewards//mean": 0.75848388671875, + "rewards//std": 0.03370039165019989, + "step": 1269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.254, + "grad_norm": 4.69452428817749, + "kl": 1.8262019213289022, + "learning_rate": 8.576801038694039e-07, + "loss": 0.1826, + "num_tokens": 10977560.0, + "reward": 0.76617431640625, + "reward_std": 0.013292655348777771, + "rewards//mean": 0.76617431640625, + "rewards//std": 0.03792916610836983, + "step": 1270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2542, + "grad_norm": 1.103826642036438, + "kl": 0.6330838054418564, + "learning_rate": 8.574582938349817e-07, + "loss": 0.0633, + "num_tokens": 10986264.0, + "reward": 0.767578125, + "reward_std": 0.0022538788616657257, + "rewards//mean": 0.767578125, + "rewards//std": 0.016977090388536453, + "step": 1271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2544, + "grad_norm": 1.8948676586151123, + "kl": 0.7893429528921843, + "learning_rate": 8.572363398164016e-07, + "loss": 0.0789, + "num_tokens": 10994872.0, + "reward": 0.76055908203125, + "reward_std": 0.003600158728659153, + "rewards//mean": 0.76055908203125, + "rewards//std": 0.023915022611618042, + "step": 1272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2546, + "grad_norm": 4.184702396392822, + "kl": 1.00759750418365, + "learning_rate": 8.570142419030666e-07, + "loss": 0.1008, + "num_tokens": 11003560.0, + "reward": 0.76800537109375, + "reward_std": 0.008670274168252945, + "rewards//mean": 0.76800537109375, + "rewards//std": 0.030358511954545975, + "step": 1273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2548, + "grad_norm": 2.8754255771636963, + "kl": 1.1997501738369465, + "learning_rate": 8.567920001844375e-07, + "loss": 0.12, + "num_tokens": 11012160.0, + "reward": 0.7510986328125, + "reward_std": 0.010766448453068733, + "rewards//mean": 0.7510986328125, + "rewards//std": 0.031791478395462036, + "step": 1274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.255, + "grad_norm": 9.786314964294434, + "kl": 2.5837177895009518, + "learning_rate": 8.565696147500337e-07, + "loss": 0.2584, + "num_tokens": 11020816.0, + "reward": 0.74859619140625, + "reward_std": 0.017663855105638504, + "rewards//mean": 0.74859619140625, + "rewards//std": 0.041773296892642975, + "step": 1275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2552, + "grad_norm": 2.8679399490356445, + "kl": 1.023074833676219, + "learning_rate": 8.563470856894314e-07, + "loss": 0.1023, + "num_tokens": 11029440.0, + "reward": 0.76708984375, + "reward_std": 0.008393588475883007, + "rewards//mean": 0.76708984375, + "rewards//std": 0.01991196535527706, + "step": 1276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2554, + "grad_norm": 4.652540683746338, + "kl": 0.9408929571509361, + "learning_rate": 8.561244130922657e-07, + "loss": 0.0941, + "num_tokens": 11038104.0, + "reward": 0.7435302734375, + "reward_std": 0.004335008095949888, + "rewards//mean": 0.7435302734375, + "rewards//std": 0.03208533674478531, + "step": 1277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2556, + "grad_norm": 6.747488975524902, + "kl": 2.049085784703493, + "learning_rate": 8.559015970482291e-07, + "loss": 0.2049, + "num_tokens": 11046760.0, + "reward": 0.74151611328125, + "reward_std": 0.010567471385002136, + "rewards//mean": 0.74151611328125, + "rewards//std": 0.044893182814121246, + "step": 1278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2558, + "grad_norm": 3.9553515911102295, + "kl": 1.5856800060719252, + "learning_rate": 8.556786376470716e-07, + "loss": 0.1586, + "num_tokens": 11055496.0, + "reward": 0.77276611328125, + "reward_std": 0.0153457997366786, + "rewards//mean": 0.77276611328125, + "rewards//std": 0.03472686558961868, + "step": 1279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.256, + "grad_norm": 3.669755220413208, + "kl": 1.9961835369467735, + "learning_rate": 8.554555349786015e-07, + "loss": 0.1996, + "num_tokens": 11064112.0, + "reward": 0.79107666015625, + "reward_std": 0.01941429078578949, + "rewards//mean": 0.79107666015625, + "rewards//std": 0.034209955483675, + "step": 1280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2562, + "grad_norm": 3.585503101348877, + "kl": 1.9079175908118486, + "learning_rate": 8.552322891326844e-07, + "loss": 0.1908, + "num_tokens": 11072752.0, + "reward": 0.76318359375, + "reward_std": 0.011368101462721825, + "rewards//mean": 0.76318359375, + "rewards//std": 0.017723919823765755, + "step": 1281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2564, + "grad_norm": 3.162195920944214, + "kl": 1.5276030581444502, + "learning_rate": 8.550089001992437e-07, + "loss": 0.1528, + "num_tokens": 11081424.0, + "reward": 0.74066162109375, + "reward_std": 0.010886474512517452, + "rewards//mean": 0.74066162109375, + "rewards//std": 0.041826896369457245, + "step": 1282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2566, + "grad_norm": 4.8460798263549805, + "kl": 1.4931025095283985, + "learning_rate": 8.547853682682604e-07, + "loss": 0.1493, + "num_tokens": 11089952.0, + "reward": 0.77691650390625, + "reward_std": 0.010478938929736614, + "rewards//mean": 0.77691650390625, + "rewards//std": 0.024060796946287155, + "step": 1283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2568, + "grad_norm": 1.2930303812026978, + "kl": 0.787976648658514, + "learning_rate": 8.545616934297733e-07, + "loss": 0.0788, + "num_tokens": 11098544.0, + "reward": 0.79632568359375, + "reward_std": 0.004147297702729702, + "rewards//mean": 0.79632568359375, + "rewards//std": 0.02091301791369915, + "step": 1284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.257, + "grad_norm": 1.154871940612793, + "kl": 1.2250561993569136, + "learning_rate": 8.543378757738784e-07, + "loss": 0.1225, + "num_tokens": 11107168.0, + "reward": 0.7586669921875, + "reward_std": 0.006589522585272789, + "rewards//mean": 0.7586669921875, + "rewards//std": 0.039533793926239014, + "step": 1285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2572, + "grad_norm": 1.7693229913711548, + "kl": 1.31552180275321, + "learning_rate": 8.541139153907295e-07, + "loss": 0.1316, + "num_tokens": 11115752.0, + "reward": 0.75714111328125, + "reward_std": 0.007101314142346382, + "rewards//mean": 0.75714111328125, + "rewards//std": 0.022418692708015442, + "step": 1286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2574, + "grad_norm": 2.711568593978882, + "kl": 1.3092481419444084, + "learning_rate": 8.538898123705379e-07, + "loss": 0.1309, + "num_tokens": 11124360.0, + "reward": 0.7662353515625, + "reward_std": 0.009426334872841835, + "rewards//mean": 0.7662353515625, + "rewards//std": 0.03132136911153793, + "step": 1287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2576, + "grad_norm": 1.8225528001785278, + "kl": 1.1906631663441658, + "learning_rate": 8.536655668035721e-07, + "loss": 0.1191, + "num_tokens": 11133016.0, + "reward": 0.76165771484375, + "reward_std": 0.0082542160525918, + "rewards//mean": 0.76165771484375, + "rewards//std": 0.035791173577308655, + "step": 1288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2578, + "grad_norm": 3.3898680210113525, + "kl": 1.274760453030467, + "learning_rate": 8.534411787801586e-07, + "loss": 0.1275, + "num_tokens": 11141656.0, + "reward": 0.7720947265625, + "reward_std": 0.008762829937040806, + "rewards//mean": 0.7720947265625, + "rewards//std": 0.03438809514045715, + "step": 1289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.258, + "grad_norm": 5.22819709777832, + "kl": 1.657284589484334, + "learning_rate": 8.532166483906802e-07, + "loss": 0.1657, + "num_tokens": 11150280.0, + "reward": 0.741455078125, + "reward_std": 0.009275168180465698, + "rewards//mean": 0.741455078125, + "rewards//std": 0.03468024730682373, + "step": 1290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2582, + "grad_norm": 7.5085835456848145, + "kl": 2.815685471519828, + "learning_rate": 8.529919757255781e-07, + "loss": 0.2816, + "num_tokens": 11158864.0, + "reward": 0.7469482421875, + "reward_std": 0.017527610063552856, + "rewards//mean": 0.7469482421875, + "rewards//std": 0.042543720453977585, + "step": 1291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2584, + "grad_norm": 2.104673385620117, + "kl": 1.5815368201583624, + "learning_rate": 8.527671608753506e-07, + "loss": 0.1582, + "num_tokens": 11167488.0, + "reward": 0.76043701171875, + "reward_std": 0.00821172446012497, + "rewards//mean": 0.76043701171875, + "rewards//std": 0.028485199436545372, + "step": 1292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2586, + "grad_norm": 4.024179458618164, + "kl": 1.4441726431250572, + "learning_rate": 8.525422039305528e-07, + "loss": 0.1444, + "num_tokens": 11176208.0, + "reward": 0.7763671875, + "reward_std": 0.015655819326639175, + "rewards//mean": 0.7763671875, + "rewards//std": 0.03253347799181938, + "step": 1293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2588, + "grad_norm": 2.9449851512908936, + "kl": 1.3701546844094992, + "learning_rate": 8.523171049817973e-07, + "loss": 0.137, + "num_tokens": 11184800.0, + "reward": 0.77728271484375, + "reward_std": 0.015028866939246655, + "rewards//mean": 0.77728271484375, + "rewards//std": 0.03298075124621391, + "step": 1294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.259, + "grad_norm": 2.749448776245117, + "kl": 0.8904798720031977, + "learning_rate": 8.520918641197541e-07, + "loss": 0.089, + "num_tokens": 11193456.0, + "reward": 0.7777099609375, + "reward_std": 0.006039226893335581, + "rewards//mean": 0.7777099609375, + "rewards//std": 0.0273013673722744, + "step": 1295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2592, + "grad_norm": 2.463488817214966, + "kl": 0.6949389223009348, + "learning_rate": 8.518664814351502e-07, + "loss": 0.0695, + "num_tokens": 11202144.0, + "reward": 0.7506103515625, + "reward_std": 0.004664257634431124, + "rewards//mean": 0.7506103515625, + "rewards//std": 0.028058256953954697, + "step": 1296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2594, + "grad_norm": 3.9094603061676025, + "kl": 1.8501922711730003, + "learning_rate": 8.516409570187696e-07, + "loss": 0.185, + "num_tokens": 11210904.0, + "reward": 0.75457763671875, + "reward_std": 0.01569659821689129, + "rewards//mean": 0.75457763671875, + "rewards//std": 0.03982730209827423, + "step": 1297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2596, + "grad_norm": 4.6628098487854, + "kl": 1.4436557963490486, + "learning_rate": 8.514152909614535e-07, + "loss": 0.1444, + "num_tokens": 11219616.0, + "reward": 0.75396728515625, + "reward_std": 0.01311071589589119, + "rewards//mean": 0.75396728515625, + "rewards//std": 0.029793795198202133, + "step": 1298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2598, + "grad_norm": 3.954066514968872, + "kl": 1.4453269317746162, + "learning_rate": 8.511894833541005e-07, + "loss": 0.1445, + "num_tokens": 11228280.0, + "reward": 0.74578857421875, + "reward_std": 0.007162814028561115, + "rewards//mean": 0.74578857421875, + "rewards//std": 0.0248158760368824, + "step": 1299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.26, + "grad_norm": 1.8072354793548584, + "kl": 0.7664232403039932, + "learning_rate": 8.509635342876654e-07, + "loss": 0.0766, + "num_tokens": 11236848.0, + "reward": 0.75201416015625, + "reward_std": 0.0031141305807977915, + "rewards//mean": 0.75201416015625, + "rewards//std": 0.02588300220668316, + "step": 1300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2602, + "grad_norm": 5.837655544281006, + "kl": 2.054176390171051, + "learning_rate": 8.507374438531606e-07, + "loss": 0.2054, + "num_tokens": 11245440.0, + "reward": 0.75836181640625, + "reward_std": 0.01429401058703661, + "rewards//mean": 0.75836181640625, + "rewards//std": 0.04007170721888542, + "step": 1301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2604, + "grad_norm": 9.02835464477539, + "kl": 1.941506065428257, + "learning_rate": 8.505112121416553e-07, + "loss": 0.1942, + "num_tokens": 11254128.0, + "reward": 0.73370361328125, + "reward_std": 0.010103583335876465, + "rewards//mean": 0.73370361328125, + "rewards//std": 0.024847574532032013, + "step": 1302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2606, + "grad_norm": 3.456326961517334, + "kl": 1.1359886340796947, + "learning_rate": 8.502848392442758e-07, + "loss": 0.1136, + "num_tokens": 11262768.0, + "reward": 0.78424072265625, + "reward_std": 0.014487136155366898, + "rewards//mean": 0.78424072265625, + "rewards//std": 0.03456038609147072, + "step": 1303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2608, + "grad_norm": 0.8553781509399414, + "kl": 0.812415087595582, + "learning_rate": 8.500583252522052e-07, + "loss": 0.0812, + "num_tokens": 11271360.0, + "reward": 0.76873779296875, + "reward_std": 0.0066340104676783085, + "rewards//mean": 0.76873779296875, + "rewards//std": 0.03270559012889862, + "step": 1304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.261, + "grad_norm": 2.3737709522247314, + "kl": 1.4914542753249407, + "learning_rate": 8.498316702566826e-07, + "loss": 0.1491, + "num_tokens": 11279992.0, + "reward": 0.76104736328125, + "reward_std": 0.010477501899003983, + "rewards//mean": 0.76104736328125, + "rewards//std": 0.020417097955942154, + "step": 1305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2612, + "grad_norm": 10.065834045410156, + "kl": 1.307991225272417, + "learning_rate": 8.496048743490053e-07, + "loss": 0.1308, + "num_tokens": 11288736.0, + "reward": 0.76806640625, + "reward_std": 0.00657601747661829, + "rewards//mean": 0.76806640625, + "rewards//std": 0.02585030160844326, + "step": 1306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2614, + "grad_norm": 2.43906307220459, + "kl": 0.7377323266118765, + "learning_rate": 8.493779376205264e-07, + "loss": 0.0738, + "num_tokens": 11297368.0, + "reward": 0.7821044921875, + "reward_std": 0.006939433049410582, + "rewards//mean": 0.7821044921875, + "rewards//std": 0.027930641546845436, + "step": 1307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2616, + "grad_norm": 2.145319938659668, + "kl": 1.0703008230775595, + "learning_rate": 8.491508601626561e-07, + "loss": 0.107, + "num_tokens": 11306032.0, + "reward": 0.7578125, + "reward_std": 0.006287199445068836, + "rewards//mean": 0.7578125, + "rewards//std": 0.03166576102375984, + "step": 1308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2618, + "grad_norm": 11.946181297302246, + "kl": 2.5898921005427837, + "learning_rate": 8.489236420668608e-07, + "loss": 0.259, + "num_tokens": 11314848.0, + "reward": 0.74591064453125, + "reward_std": 0.012224599719047546, + "rewards//mean": 0.74591064453125, + "rewards//std": 0.031074577942490578, + "step": 1309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.262, + "grad_norm": 1.9521502256393433, + "kl": 1.122292123734951, + "learning_rate": 8.486962834246645e-07, + "loss": 0.1122, + "num_tokens": 11323456.0, + "reward": 0.73980712890625, + "reward_std": 0.0033526255283504725, + "rewards//mean": 0.73980712890625, + "rewards//std": 0.028353635221719742, + "step": 1310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2622, + "grad_norm": 4.476296424865723, + "kl": 1.1261905822902918, + "learning_rate": 8.484687843276468e-07, + "loss": 0.1126, + "num_tokens": 11332048.0, + "reward": 0.75634765625, + "reward_std": 0.004982992075383663, + "rewards//mean": 0.75634765625, + "rewards//std": 0.030012547969818115, + "step": 1311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2624, + "grad_norm": 2.3960392475128174, + "kl": 1.8680939860641956, + "learning_rate": 8.482411448674445e-07, + "loss": 0.1868, + "num_tokens": 11340712.0, + "reward": 0.7462158203125, + "reward_std": 0.010261274874210358, + "rewards//mean": 0.7462158203125, + "rewards//std": 0.03507671877741814, + "step": 1312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2626, + "grad_norm": 6.032341480255127, + "kl": 1.5083320494741201, + "learning_rate": 8.480133651357505e-07, + "loss": 0.1508, + "num_tokens": 11349392.0, + "reward": 0.74969482421875, + "reward_std": 0.007270030677318573, + "rewards//mean": 0.74969482421875, + "rewards//std": 0.030459566041827202, + "step": 1313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2628, + "grad_norm": 2.41402268409729, + "kl": 2.0536234378814697, + "learning_rate": 8.477854452243147e-07, + "loss": 0.2054, + "num_tokens": 11358080.0, + "reward": 0.78143310546875, + "reward_std": 0.01560906134545803, + "rewards//mean": 0.78143310546875, + "rewards//std": 0.030140826478600502, + "step": 1314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.263, + "grad_norm": 1.914323091506958, + "kl": 1.3961933478713036, + "learning_rate": 8.475573852249434e-07, + "loss": 0.1396, + "num_tokens": 11366696.0, + "reward": 0.74169921875, + "reward_std": 0.006042609456926584, + "rewards//mean": 0.74169921875, + "rewards//std": 0.030221641063690186, + "step": 1315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2632, + "grad_norm": 3.4490444660186768, + "kl": 1.9582337886095047, + "learning_rate": 8.473291852294986e-07, + "loss": 0.1958, + "num_tokens": 11375288.0, + "reward": 0.7637939453125, + "reward_std": 0.01188709493726492, + "rewards//mean": 0.7637939453125, + "rewards//std": 0.024596910923719406, + "step": 1316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2634, + "grad_norm": 3.589838981628418, + "kl": 1.6921525243669748, + "learning_rate": 8.471008453298996e-07, + "loss": 0.1692, + "num_tokens": 11383904.0, + "reward": 0.775390625, + "reward_std": 0.0128225889056921, + "rewards//mean": 0.775390625, + "rewards//std": 0.03313101455569267, + "step": 1317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2636, + "grad_norm": 2.3534553050994873, + "kl": 1.923955136910081, + "learning_rate": 8.468723656181218e-07, + "loss": 0.1924, + "num_tokens": 11392448.0, + "reward": 0.75640869140625, + "reward_std": 0.012626361101865768, + "rewards//mean": 0.75640869140625, + "rewards//std": 0.02944113127887249, + "step": 1318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2638, + "grad_norm": 9.595563888549805, + "kl": 1.5738086681813002, + "learning_rate": 8.466437461861964e-07, + "loss": 0.1574, + "num_tokens": 11401088.0, + "reward": 0.773193359375, + "reward_std": 0.01257226150482893, + "rewards//mean": 0.773193359375, + "rewards//std": 0.02903946116566658, + "step": 1319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.264, + "grad_norm": 7.869680404663086, + "kl": 1.9713663961738348, + "learning_rate": 8.464149871262116e-07, + "loss": 0.1971, + "num_tokens": 11409840.0, + "reward": 0.747802734375, + "reward_std": 0.007948263548314571, + "rewards//mean": 0.747802734375, + "rewards//std": 0.03971964120864868, + "step": 1320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2642, + "grad_norm": 1.781855821609497, + "kl": 1.3266140967607498, + "learning_rate": 8.461860885303113e-07, + "loss": 0.1327, + "num_tokens": 11418512.0, + "reward": 0.7415771484375, + "reward_std": 0.005644769407808781, + "rewards//mean": 0.7415771484375, + "rewards//std": 0.02440165914595127, + "step": 1321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2644, + "grad_norm": 2.119027853012085, + "kl": 1.2260348349809647, + "learning_rate": 8.459570504906961e-07, + "loss": 0.1226, + "num_tokens": 11427168.0, + "reward": 0.75335693359375, + "reward_std": 0.011243656277656555, + "rewards//mean": 0.75335693359375, + "rewards//std": 0.023590637370944023, + "step": 1322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2646, + "grad_norm": 5.476919651031494, + "kl": 1.813824001699686, + "learning_rate": 8.457278730996222e-07, + "loss": 0.1814, + "num_tokens": 11435776.0, + "reward": 0.79547119140625, + "reward_std": 0.01277064997702837, + "rewards//mean": 0.79547119140625, + "rewards//std": 0.028351498767733574, + "step": 1323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2648, + "grad_norm": 4.00071907043457, + "kl": 2.1896361093968153, + "learning_rate": 8.454985564494024e-07, + "loss": 0.219, + "num_tokens": 11444424.0, + "reward": 0.7470703125, + "reward_std": 0.016677251085639, + "rewards//mean": 0.7470703125, + "rewards//std": 0.03972649946808815, + "step": 1324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.265, + "grad_norm": 8.978479385375977, + "kl": 3.264270981773734, + "learning_rate": 8.452691006324054e-07, + "loss": 0.3264, + "num_tokens": 11453048.0, + "reward": 0.7388916015625, + "reward_std": 0.01705537736415863, + "rewards//mean": 0.7388916015625, + "rewards//std": 0.04073323681950569, + "step": 1325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2652, + "grad_norm": 6.3131914138793945, + "kl": 2.1231815684586763, + "learning_rate": 8.45039505741056e-07, + "loss": 0.2123, + "num_tokens": 11461712.0, + "reward": 0.74981689453125, + "reward_std": 0.005183476489037275, + "rewards//mean": 0.74981689453125, + "rewards//std": 0.036765486001968384, + "step": 1326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2654, + "grad_norm": 4.836770534515381, + "kl": 1.5332971066236496, + "learning_rate": 8.448097718678348e-07, + "loss": 0.1533, + "num_tokens": 11470416.0, + "reward": 0.745849609375, + "reward_std": 0.011320450343191624, + "rewards//mean": 0.745849609375, + "rewards//std": 0.03728518262505531, + "step": 1327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2656, + "grad_norm": 6.643113136291504, + "kl": 2.7718011625111103, + "learning_rate": 8.44579899105279e-07, + "loss": 0.2772, + "num_tokens": 11479032.0, + "reward": 0.74407958984375, + "reward_std": 0.01326703280210495, + "rewards//mean": 0.74407958984375, + "rewards//std": 0.029495583847165108, + "step": 1328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2658, + "grad_norm": 2.29469895362854, + "kl": 1.0763975717127323, + "learning_rate": 8.443498875459808e-07, + "loss": 0.1076, + "num_tokens": 11487568.0, + "reward": 0.72308349609375, + "reward_std": 0.010130094364285469, + "rewards//mean": 0.72308349609375, + "rewards//std": 0.03735165670514107, + "step": 1329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.266, + "grad_norm": 2.7798666954040527, + "kl": 1.92560656927526, + "learning_rate": 8.441197372825892e-07, + "loss": 0.1926, + "num_tokens": 11496160.0, + "reward": 0.7310791015625, + "reward_std": 0.013944422826170921, + "rewards//mean": 0.7310791015625, + "rewards//std": 0.0374300442636013, + "step": 1330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2662, + "grad_norm": 4.736409664154053, + "kl": 1.1363797690719366, + "learning_rate": 8.438894484078085e-07, + "loss": 0.1136, + "num_tokens": 11504792.0, + "reward": 0.73931884765625, + "reward_std": 0.007724676746875048, + "rewards//mean": 0.73931884765625, + "rewards//std": 0.029308177530765533, + "step": 1331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2664, + "grad_norm": 3.171586275100708, + "kl": 2.3072773162275553, + "learning_rate": 8.43659021014399e-07, + "loss": 0.2307, + "num_tokens": 11513432.0, + "reward": 0.75714111328125, + "reward_std": 0.015399273484945297, + "rewards//mean": 0.75714111328125, + "rewards//std": 0.023984549567103386, + "step": 1332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2666, + "grad_norm": 2.667379856109619, + "kl": 1.5344881527125835, + "learning_rate": 8.434284551951772e-07, + "loss": 0.1534, + "num_tokens": 11522112.0, + "reward": 0.77593994140625, + "reward_std": 0.01357704121619463, + "rewards//mean": 0.77593994140625, + "rewards//std": 0.029375243932008743, + "step": 1333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2668, + "grad_norm": 2.44822359085083, + "kl": 1.4912556074559689, + "learning_rate": 8.431977510430145e-07, + "loss": 0.1491, + "num_tokens": 11530648.0, + "reward": 0.72918701171875, + "reward_std": 0.014410671778023243, + "rewards//mean": 0.72918701171875, + "rewards//std": 0.041754089295864105, + "step": 1334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.267, + "grad_norm": 3.6548104286193848, + "kl": 1.5616823472082615, + "learning_rate": 8.429669086508389e-07, + "loss": 0.1562, + "num_tokens": 11539376.0, + "reward": 0.7783203125, + "reward_std": 0.02049800381064415, + "rewards//mean": 0.7783203125, + "rewards//std": 0.04042753949761391, + "step": 1335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2672, + "grad_norm": 4.781790733337402, + "kl": 1.5930482968688011, + "learning_rate": 8.427359281116333e-07, + "loss": 0.1593, + "num_tokens": 11548024.0, + "reward": 0.7432861328125, + "reward_std": 0.011560136452317238, + "rewards//mean": 0.7432861328125, + "rewards//std": 0.0274319089949131, + "step": 1336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2674, + "grad_norm": 5.124735355377197, + "kl": 1.7172706872224808, + "learning_rate": 8.42504809518437e-07, + "loss": 0.1717, + "num_tokens": 11556656.0, + "reward": 0.75689697265625, + "reward_std": 0.012276984751224518, + "rewards//mean": 0.75689697265625, + "rewards//std": 0.02750485949218273, + "step": 1337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2676, + "grad_norm": 7.540124416351318, + "kl": 1.4192634783685207, + "learning_rate": 8.422735529643443e-07, + "loss": 0.1419, + "num_tokens": 11565296.0, + "reward": 0.76983642578125, + "reward_std": 0.008768022060394287, + "rewards//mean": 0.76983642578125, + "rewards//std": 0.025960668921470642, + "step": 1338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2678, + "grad_norm": 4.039872169494629, + "kl": 1.52224126085639, + "learning_rate": 8.420421585425055e-07, + "loss": 0.1522, + "num_tokens": 11573864.0, + "reward": 0.739013671875, + "reward_std": 0.011027004569768906, + "rewards//mean": 0.739013671875, + "rewards//std": 0.033275991678237915, + "step": 1339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.268, + "grad_norm": 7.840274810791016, + "kl": 1.426694292575121, + "learning_rate": 8.41810626346126e-07, + "loss": 0.1427, + "num_tokens": 11582424.0, + "reward": 0.7435302734375, + "reward_std": 0.006033569574356079, + "rewards//mean": 0.7435302734375, + "rewards//std": 0.03037925995886326, + "step": 1340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2682, + "grad_norm": 41.64866256713867, + "kl": 0.9570504669100046, + "learning_rate": 8.415789564684673e-07, + "loss": 0.0957, + "num_tokens": 11590968.0, + "reward": 0.793212890625, + "reward_std": 0.008649616502225399, + "rewards//mean": 0.793212890625, + "rewards//std": 0.018556727096438408, + "step": 1341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2684, + "grad_norm": 5.264158725738525, + "kl": 1.6046327948570251, + "learning_rate": 8.413471490028455e-07, + "loss": 0.1605, + "num_tokens": 11599640.0, + "reward": 0.77490234375, + "reward_std": 0.011650312691926956, + "rewards//mean": 0.77490234375, + "rewards//std": 0.024404451251029968, + "step": 1342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2686, + "grad_norm": 17.011680603027344, + "kl": 1.7282491251826286, + "learning_rate": 8.41115204042633e-07, + "loss": 0.1728, + "num_tokens": 11608232.0, + "reward": 0.7513427734375, + "reward_std": 0.009822498075664043, + "rewards//mean": 0.7513427734375, + "rewards//std": 0.03263173997402191, + "step": 1343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2688, + "grad_norm": 5.446418762207031, + "kl": 1.3230797797441483, + "learning_rate": 8.408831216812573e-07, + "loss": 0.1323, + "num_tokens": 11616920.0, + "reward": 0.7596435546875, + "reward_std": 0.008739085868000984, + "rewards//mean": 0.7596435546875, + "rewards//std": 0.04049918055534363, + "step": 1344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.269, + "grad_norm": 41.21394348144531, + "kl": 2.531970787793398, + "learning_rate": 8.406509020122008e-07, + "loss": 0.2532, + "num_tokens": 11625560.0, + "reward": 0.78192138671875, + "reward_std": 0.00816678162664175, + "rewards//mean": 0.78192138671875, + "rewards//std": 0.02970474772155285, + "step": 1345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2692, + "grad_norm": 31.490951538085938, + "kl": 2.656165039166808, + "learning_rate": 8.404185451290017e-07, + "loss": 0.2656, + "num_tokens": 11634256.0, + "reward": 0.759765625, + "reward_std": 0.014978468418121338, + "rewards//mean": 0.759765625, + "rewards//std": 0.042722851037979126, + "step": 1346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2694, + "grad_norm": 43.99723815917969, + "kl": 2.4182144086807966, + "learning_rate": 8.401860511252533e-07, + "loss": 0.2418, + "num_tokens": 11642840.0, + "reward": 0.7490234375, + "reward_std": 0.01979096420109272, + "rewards//mean": 0.7490234375, + "rewards//std": 0.03907490149140358, + "step": 1347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2696, + "grad_norm": 21.77730941772461, + "kl": 1.8901789207011461, + "learning_rate": 8.399534200946043e-07, + "loss": 0.189, + "num_tokens": 11651408.0, + "reward": 0.74652099609375, + "reward_std": 0.014805897139012814, + "rewards//mean": 0.74652099609375, + "rewards//std": 0.03428862988948822, + "step": 1348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2698, + "grad_norm": 11.703360557556152, + "kl": 2.190919779241085, + "learning_rate": 8.397206521307583e-07, + "loss": 0.2191, + "num_tokens": 11660048.0, + "reward": 0.75946044921875, + "reward_std": 0.005028828978538513, + "rewards//mean": 0.75946044921875, + "rewards//std": 0.02802877128124237, + "step": 1349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.27, + "grad_norm": 1.4796116352081299, + "kl": 1.4740431364625692, + "learning_rate": 8.394877473274741e-07, + "loss": 0.1474, + "num_tokens": 11668728.0, + "reward": 0.74029541015625, + "reward_std": 0.006811236031353474, + "rewards//mean": 0.74029541015625, + "rewards//std": 0.032652318477630615, + "step": 1350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2702, + "grad_norm": 3.8744845390319824, + "kl": 1.435689877718687, + "learning_rate": 8.392547057785661e-07, + "loss": 0.1436, + "num_tokens": 11677376.0, + "reward": 0.7760009765625, + "reward_std": 0.015611299313604832, + "rewards//mean": 0.7760009765625, + "rewards//std": 0.030098924413323402, + "step": 1351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2704, + "grad_norm": 6.678821563720703, + "kl": 1.6774704921990633, + "learning_rate": 8.39021527577903e-07, + "loss": 0.1677, + "num_tokens": 11686016.0, + "reward": 0.7735595703125, + "reward_std": 0.017164621502161026, + "rewards//mean": 0.7735595703125, + "rewards//std": 0.03084409609436989, + "step": 1352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2706, + "grad_norm": 2.4394986629486084, + "kl": 1.917915841564536, + "learning_rate": 8.387882128194092e-07, + "loss": 0.1918, + "num_tokens": 11694712.0, + "reward": 0.7869873046875, + "reward_std": 0.016864636912941933, + "rewards//mean": 0.7869873046875, + "rewards//std": 0.03363863006234169, + "step": 1353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2708, + "grad_norm": 2.828508138656616, + "kl": 1.0346383973956108, + "learning_rate": 8.385547615970638e-07, + "loss": 0.1035, + "num_tokens": 11703384.0, + "reward": 0.7320556640625, + "reward_std": 0.008335303515195847, + "rewards//mean": 0.7320556640625, + "rewards//std": 0.02160627581179142, + "step": 1354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.271, + "grad_norm": 3.749871015548706, + "kl": 1.5432432442903519, + "learning_rate": 8.38321174004901e-07, + "loss": 0.1543, + "num_tokens": 11712016.0, + "reward": 0.7733154296875, + "reward_std": 0.012710566632449627, + "rewards//mean": 0.7733154296875, + "rewards//std": 0.02752446010708809, + "step": 1355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2712, + "grad_norm": 2.655707597732544, + "kl": 1.4231681134551764, + "learning_rate": 8.380874501370097e-07, + "loss": 0.1423, + "num_tokens": 11720648.0, + "reward": 0.7662353515625, + "reward_std": 0.010058829560875893, + "rewards//mean": 0.7662353515625, + "rewards//std": 0.030403168871998787, + "step": 1356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2714, + "grad_norm": 8.475590705871582, + "kl": 1.1434898935258389, + "learning_rate": 8.378535900875338e-07, + "loss": 0.1143, + "num_tokens": 11729304.0, + "reward": 0.73297119140625, + "reward_std": 0.013280518352985382, + "rewards//mean": 0.73297119140625, + "rewards//std": 0.032018449157476425, + "step": 1357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2716, + "grad_norm": 5.44661283493042, + "kl": 1.1054871659725904, + "learning_rate": 8.376195939506725e-07, + "loss": 0.1105, + "num_tokens": 11738040.0, + "reward": 0.734619140625, + "reward_std": 0.008744160644710064, + "rewards//mean": 0.734619140625, + "rewards//std": 0.031225770711898804, + "step": 1358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2718, + "grad_norm": 11.936436653137207, + "kl": 2.7080367766320705, + "learning_rate": 8.373854618206789e-07, + "loss": 0.2708, + "num_tokens": 11746752.0, + "reward": 0.72198486328125, + "reward_std": 0.01292148232460022, + "rewards//mean": 0.72198486328125, + "rewards//std": 0.04071144387125969, + "step": 1359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.272, + "grad_norm": 2.589662551879883, + "kl": 1.080529686063528, + "learning_rate": 8.371511937918617e-07, + "loss": 0.1081, + "num_tokens": 11755448.0, + "reward": 0.72021484375, + "reward_std": 0.007460972294211388, + "rewards//mean": 0.72021484375, + "rewards//std": 0.03240293264389038, + "step": 1360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2722, + "grad_norm": 2.6604063510894775, + "kl": 1.9544231854379177, + "learning_rate": 8.369167899585839e-07, + "loss": 0.1954, + "num_tokens": 11764152.0, + "reward": 0.759033203125, + "reward_std": 0.012583325617015362, + "rewards//mean": 0.759033203125, + "rewards//std": 0.033967550843954086, + "step": 1361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2724, + "grad_norm": 7.418876647949219, + "kl": 0.8234708085656166, + "learning_rate": 8.366822504152636e-07, + "loss": 0.0823, + "num_tokens": 11772776.0, + "reward": 0.73114013671875, + "reward_std": 0.0036419208627194166, + "rewards//mean": 0.73114013671875, + "rewards//std": 0.028743363916873932, + "step": 1362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2726, + "grad_norm": 3.678071975708008, + "kl": 1.141478419303894, + "learning_rate": 8.364475752563728e-07, + "loss": 0.1141, + "num_tokens": 11781408.0, + "reward": 0.75323486328125, + "reward_std": 0.008702388033270836, + "rewards//mean": 0.75323486328125, + "rewards//std": 0.034319959580898285, + "step": 1363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2728, + "grad_norm": 2.519693374633789, + "kl": 0.5582467820495367, + "learning_rate": 8.362127645764389e-07, + "loss": 0.0558, + "num_tokens": 11789976.0, + "reward": 0.75274658203125, + "reward_std": 0.0028330846689641476, + "rewards//mean": 0.75274658203125, + "rewards//std": 0.025052646175026894, + "step": 1364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.273, + "grad_norm": 4.423770427703857, + "kl": 2.3881990388035774, + "learning_rate": 8.359778184700439e-07, + "loss": 0.2388, + "num_tokens": 11798632.0, + "reward": 0.79425048828125, + "reward_std": 0.02025095745921135, + "rewards//mean": 0.79425048828125, + "rewards//std": 0.032324761152267456, + "step": 1365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2732, + "grad_norm": 3.8239078521728516, + "kl": 1.0817086547613144, + "learning_rate": 8.357427370318238e-07, + "loss": 0.1082, + "num_tokens": 11807256.0, + "reward": 0.76885986328125, + "reward_std": 0.007265827618539333, + "rewards//mean": 0.76885986328125, + "rewards//std": 0.025167187675833702, + "step": 1366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2734, + "grad_norm": 3.446352481842041, + "kl": 0.7759816534817219, + "learning_rate": 8.355075203564692e-07, + "loss": 0.0776, + "num_tokens": 11815920.0, + "reward": 0.75262451171875, + "reward_std": 0.00164124951697886, + "rewards//mean": 0.75262451171875, + "rewards//std": 0.028603991493582726, + "step": 1367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2736, + "grad_norm": 10.429774284362793, + "kl": 1.9041251055896282, + "learning_rate": 8.352721685387256e-07, + "loss": 0.1904, + "num_tokens": 11824560.0, + "reward": 0.7603759765625, + "reward_std": 0.011990025639533997, + "rewards//mean": 0.7603759765625, + "rewards//std": 0.033329401165246964, + "step": 1368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2738, + "grad_norm": 5.327267169952393, + "kl": 1.3947375752031803, + "learning_rate": 8.350366816733926e-07, + "loss": 0.1395, + "num_tokens": 11833184.0, + "reward": 0.7496337890625, + "reward_std": 0.011457724496722221, + "rewards//mean": 0.7496337890625, + "rewards//std": 0.03452516347169876, + "step": 1369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.274, + "grad_norm": 2.0751752853393555, + "kl": 1.0512386709451675, + "learning_rate": 8.348010598553243e-07, + "loss": 0.1051, + "num_tokens": 11841856.0, + "reward": 0.75750732421875, + "reward_std": 0.005020422860980034, + "rewards//mean": 0.75750732421875, + "rewards//std": 0.022939324378967285, + "step": 1370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2742, + "grad_norm": 7.620663642883301, + "kl": 1.806717999279499, + "learning_rate": 8.34565303179429e-07, + "loss": 0.1807, + "num_tokens": 11850632.0, + "reward": 0.7593994140625, + "reward_std": 0.008870774880051613, + "rewards//mean": 0.7593994140625, + "rewards//std": 0.0319984070956707, + "step": 1371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2744, + "grad_norm": 1.1963679790496826, + "kl": 0.5940828789025545, + "learning_rate": 8.343294117406698e-07, + "loss": 0.0594, + "num_tokens": 11859328.0, + "reward": 0.76617431640625, + "reward_std": 0.003053711960092187, + "rewards//mean": 0.76617431640625, + "rewards//std": 0.023155411705374718, + "step": 1372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2746, + "grad_norm": 7.2105231285095215, + "kl": 1.6511837430298328, + "learning_rate": 8.340933856340635e-07, + "loss": 0.1651, + "num_tokens": 11867920.0, + "reward": 0.7418212890625, + "reward_std": 0.0061455387622118, + "rewards//mean": 0.7418212890625, + "rewards//std": 0.033274855464696884, + "step": 1373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2748, + "grad_norm": 4.255744457244873, + "kl": 1.6649334002286196, + "learning_rate": 8.338572249546812e-07, + "loss": 0.1665, + "num_tokens": 11876616.0, + "reward": 0.75311279296875, + "reward_std": 0.006844652350991964, + "rewards//mean": 0.75311279296875, + "rewards//std": 0.035673823207616806, + "step": 1374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.275, + "grad_norm": 4.840819358825684, + "kl": 1.4184761084616184, + "learning_rate": 8.336209297976489e-07, + "loss": 0.1418, + "num_tokens": 11885248.0, + "reward": 0.77081298828125, + "reward_std": 0.009416550397872925, + "rewards//mean": 0.77081298828125, + "rewards//std": 0.020740758627653122, + "step": 1375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2752, + "grad_norm": 9.01152515411377, + "kl": 2.583045953884721, + "learning_rate": 8.333845002581458e-07, + "loss": 0.2583, + "num_tokens": 11893872.0, + "reward": 0.75299072265625, + "reward_std": 0.0110011612996459, + "rewards//mean": 0.75299072265625, + "rewards//std": 0.029860787093639374, + "step": 1376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2754, + "grad_norm": 3.243278980255127, + "kl": 1.8015248626470566, + "learning_rate": 8.331479364314059e-07, + "loss": 0.1802, + "num_tokens": 11902448.0, + "reward": 0.75286865234375, + "reward_std": 0.009883337654173374, + "rewards//mean": 0.75286865234375, + "rewards//std": 0.029341213405132294, + "step": 1377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2756, + "grad_norm": 5.122808933258057, + "kl": 1.906004762277007, + "learning_rate": 8.32911238412717e-07, + "loss": 0.1906, + "num_tokens": 11911064.0, + "reward": 0.78759765625, + "reward_std": 0.01040503941476345, + "rewards//mean": 0.78759765625, + "rewards//std": 0.029433997347950935, + "step": 1378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2758, + "grad_norm": 8.389034271240234, + "kl": 0.8222418315708637, + "learning_rate": 8.326744062974211e-07, + "loss": 0.0822, + "num_tokens": 11919760.0, + "reward": 0.74853515625, + "reward_std": 0.004016595426946878, + "rewards//mean": 0.74853515625, + "rewards//std": 0.02361765317618847, + "step": 1379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.276, + "grad_norm": 4.624485492706299, + "kl": 1.3433728516101837, + "learning_rate": 8.324374401809142e-07, + "loss": 0.1343, + "num_tokens": 11928344.0, + "reward": 0.72930908203125, + "reward_std": 0.012796587310731411, + "rewards//mean": 0.72930908203125, + "rewards//std": 0.04307110235095024, + "step": 1380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2762, + "grad_norm": 14.707901954650879, + "kl": 0.9655712600797415, + "learning_rate": 8.322003401586461e-07, + "loss": 0.0966, + "num_tokens": 11936976.0, + "reward": 0.77117919921875, + "reward_std": 0.0059419069439172745, + "rewards//mean": 0.77117919921875, + "rewards//std": 0.02969047613441944, + "step": 1381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2764, + "grad_norm": 2.6805195808410645, + "kl": 1.6826000418514013, + "learning_rate": 8.319631063261207e-07, + "loss": 0.1683, + "num_tokens": 11945576.0, + "reward": 0.755615234375, + "reward_std": 0.008507579565048218, + "rewards//mean": 0.755615234375, + "rewards//std": 0.025703487917780876, + "step": 1382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2766, + "grad_norm": 0.2917509973049164, + "kl": 0.4393001478165388, + "learning_rate": 8.317257387788958e-07, + "loss": 0.0439, + "num_tokens": 11954136.0, + "reward": 0.74420166015625, + "reward_std": 0.0009134745923802257, + "rewards//mean": 0.74420166015625, + "rewards//std": 0.022498900070786476, + "step": 1383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2768, + "grad_norm": 6.497793674468994, + "kl": 1.4654463231563568, + "learning_rate": 8.314882376125831e-07, + "loss": 0.1465, + "num_tokens": 11962824.0, + "reward": 0.783447265625, + "reward_std": 0.01338261365890503, + "rewards//mean": 0.783447265625, + "rewards//std": 0.03168009966611862, + "step": 1384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.277, + "grad_norm": 1.3848124742507935, + "kl": 0.572774613276124, + "learning_rate": 8.312506029228477e-07, + "loss": 0.0573, + "num_tokens": 11971504.0, + "reward": 0.75177001953125, + "reward_std": 0.003506066743284464, + "rewards//mean": 0.75177001953125, + "rewards//std": 0.029289059340953827, + "step": 1385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2772, + "grad_norm": 9.892672538757324, + "kl": 2.5279021225869656, + "learning_rate": 8.310128348054093e-07, + "loss": 0.2528, + "num_tokens": 11980152.0, + "reward": 0.7596435546875, + "reward_std": 0.00803013239055872, + "rewards//mean": 0.7596435546875, + "rewards//std": 0.0346529558300972, + "step": 1386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2774, + "grad_norm": 1.4608051776885986, + "kl": 0.91611497849226, + "learning_rate": 8.307749333560404e-07, + "loss": 0.0916, + "num_tokens": 11988744.0, + "reward": 0.767333984375, + "reward_std": 0.00652629230171442, + "rewards//mean": 0.767333984375, + "rewards//std": 0.021405315026640892, + "step": 1387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2776, + "grad_norm": 19.319339752197266, + "kl": 0.8584948647767305, + "learning_rate": 8.305368986705681e-07, + "loss": 0.0858, + "num_tokens": 11997312.0, + "reward": 0.73516845703125, + "reward_std": 0.00625626090914011, + "rewards//mean": 0.73516845703125, + "rewards//std": 0.034359633922576904, + "step": 1388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2778, + "grad_norm": 3.730419397354126, + "kl": 1.312489127740264, + "learning_rate": 8.302987308448723e-07, + "loss": 0.1312, + "num_tokens": 12005928.0, + "reward": 0.759033203125, + "reward_std": 0.010758569464087486, + "rewards//mean": 0.759033203125, + "rewards//std": 0.03963417932391167, + "step": 1389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.278, + "grad_norm": 2.4365758895874023, + "kl": 0.5073908474296331, + "learning_rate": 8.300604299748874e-07, + "loss": 0.0507, + "num_tokens": 12014688.0, + "reward": 0.74853515625, + "reward_std": 0.0016159163787961006, + "rewards//mean": 0.74853515625, + "rewards//std": 0.02691522240638733, + "step": 1390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2782, + "grad_norm": 13.980290412902832, + "kl": 2.666330335661769, + "learning_rate": 8.298219961566008e-07, + "loss": 0.2666, + "num_tokens": 12023352.0, + "reward": 0.72003173828125, + "reward_std": 0.015395499765872955, + "rewards//mean": 0.72003173828125, + "rewards//std": 0.03750009462237358, + "step": 1391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2784, + "grad_norm": 9.084924697875977, + "kl": 1.347284598276019, + "learning_rate": 8.295834294860534e-07, + "loss": 0.1347, + "num_tokens": 12031984.0, + "reward": 0.75323486328125, + "reward_std": 0.0059725199826061726, + "rewards//mean": 0.75323486328125, + "rewards//std": 0.028861092403531075, + "step": 1392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2786, + "grad_norm": 4.401847839355469, + "kl": 1.36901949159801, + "learning_rate": 8.293447300593402e-07, + "loss": 0.1369, + "num_tokens": 12040632.0, + "reward": 0.754150390625, + "reward_std": 0.01407884992659092, + "rewards//mean": 0.754150390625, + "rewards//std": 0.03121025487780571, + "step": 1393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2788, + "grad_norm": 6.017202377319336, + "kl": 1.452460439875722, + "learning_rate": 8.291058979726091e-07, + "loss": 0.1452, + "num_tokens": 12049192.0, + "reward": 0.7608642578125, + "reward_std": 0.006034743040800095, + "rewards//mean": 0.7608642578125, + "rewards//std": 0.030595744028687477, + "step": 1394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.279, + "grad_norm": 1.6311331987380981, + "kl": 1.444918017834425, + "learning_rate": 8.288669333220614e-07, + "loss": 0.1445, + "num_tokens": 12057776.0, + "reward": 0.7515869140625, + "reward_std": 0.010062210261821747, + "rewards//mean": 0.7515869140625, + "rewards//std": 0.021784896031022072, + "step": 1395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2792, + "grad_norm": 5.6906328201293945, + "kl": 2.264435239136219, + "learning_rate": 8.286278362039527e-07, + "loss": 0.2264, + "num_tokens": 12066448.0, + "reward": 0.76422119140625, + "reward_std": 0.011034558527171612, + "rewards//mean": 0.76422119140625, + "rewards//std": 0.028188716620206833, + "step": 1396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2794, + "grad_norm": 1.753432273864746, + "kl": 1.2903249748051167, + "learning_rate": 8.283886067145906e-07, + "loss": 0.129, + "num_tokens": 12075056.0, + "reward": 0.7764892578125, + "reward_std": 0.012123174034059048, + "rewards//mean": 0.7764892578125, + "rewards//std": 0.0373992957174778, + "step": 1397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2796, + "grad_norm": 4.2095818519592285, + "kl": 1.2844493500888348, + "learning_rate": 8.281492449503372e-07, + "loss": 0.1284, + "num_tokens": 12083688.0, + "reward": 0.763427734375, + "reward_std": 0.007913639768958092, + "rewards//mean": 0.763427734375, + "rewards//std": 0.025285478681325912, + "step": 1398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2798, + "grad_norm": 3.5186710357666016, + "kl": 1.1074212044477463, + "learning_rate": 8.279097510076069e-07, + "loss": 0.1107, + "num_tokens": 12092360.0, + "reward": 0.7769775390625, + "reward_std": 0.009716667234897614, + "rewards//mean": 0.7769775390625, + "rewards//std": 0.033347565680742264, + "step": 1399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.28, + "grad_norm": 1.393874168395996, + "kl": 1.345550624653697, + "learning_rate": 8.276701249828684e-07, + "loss": 0.1346, + "num_tokens": 12100960.0, + "reward": 0.75579833984375, + "reward_std": 0.009464503265917301, + "rewards//mean": 0.75579833984375, + "rewards//std": 0.03382548317313194, + "step": 1400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2802, + "grad_norm": 3.9176719188690186, + "kl": 0.874677112326026, + "learning_rate": 8.274303669726426e-07, + "loss": 0.0875, + "num_tokens": 12109584.0, + "reward": 0.764404296875, + "reward_std": 0.006336133927106857, + "rewards//mean": 0.764404296875, + "rewards//std": 0.03806307911872864, + "step": 1401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2804, + "grad_norm": 11.06923770904541, + "kl": 1.2614341340959072, + "learning_rate": 8.271904770735041e-07, + "loss": 0.1261, + "num_tokens": 12118208.0, + "reward": 0.77410888671875, + "reward_std": 0.010842102579772472, + "rewards//mean": 0.77410888671875, + "rewards//std": 0.029729198664426804, + "step": 1402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2806, + "grad_norm": 4.535281658172607, + "kl": 1.4729410503059626, + "learning_rate": 8.269504553820805e-07, + "loss": 0.1473, + "num_tokens": 12126832.0, + "reward": 0.78204345703125, + "reward_std": 0.008088983595371246, + "rewards//mean": 0.78204345703125, + "rewards//std": 0.026659158989787102, + "step": 1403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2808, + "grad_norm": 3.3165090084075928, + "kl": 1.9795909393578768, + "learning_rate": 8.267103019950528e-07, + "loss": 0.198, + "num_tokens": 12135432.0, + "reward": 0.71868896484375, + "reward_std": 0.010753355920314789, + "rewards//mean": 0.71868896484375, + "rewards//std": 0.03845316171646118, + "step": 1404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.281, + "grad_norm": 6.39655876159668, + "kl": 1.7917762715369463, + "learning_rate": 8.264700170091543e-07, + "loss": 0.1792, + "num_tokens": 12143992.0, + "reward": 0.75311279296875, + "reward_std": 0.011714443564414978, + "rewards//mean": 0.75311279296875, + "rewards//std": 0.02290630340576172, + "step": 1405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2812, + "grad_norm": 15.693798065185547, + "kl": 2.989787459373474, + "learning_rate": 8.262296005211721e-07, + "loss": 0.299, + "num_tokens": 12152592.0, + "reward": 0.74774169921875, + "reward_std": 0.009115074761211872, + "rewards//mean": 0.74774169921875, + "rewards//std": 0.03236033394932747, + "step": 1406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2814, + "grad_norm": 2.443939208984375, + "kl": 1.4647480100393295, + "learning_rate": 8.259890526279459e-07, + "loss": 0.1465, + "num_tokens": 12161296.0, + "reward": 0.7900390625, + "reward_std": 0.010983582586050034, + "rewards//mean": 0.7900390625, + "rewards//std": 0.02108754962682724, + "step": 1407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2816, + "grad_norm": 6.9362592697143555, + "kl": 2.383898377418518, + "learning_rate": 8.257483734263681e-07, + "loss": 0.2384, + "num_tokens": 12169976.0, + "reward": 0.7171630859375, + "reward_std": 0.01384001411497593, + "rewards//mean": 0.7171630859375, + "rewards//std": 0.04279489442706108, + "step": 1408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2818, + "grad_norm": 6.503877639770508, + "kl": 0.9984986390918493, + "learning_rate": 8.255075630133845e-07, + "loss": 0.0998, + "num_tokens": 12178552.0, + "reward": 0.77008056640625, + "reward_std": 0.00814627856016159, + "rewards//mean": 0.77008056640625, + "rewards//std": 0.019782207906246185, + "step": 1409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.282, + "grad_norm": 4.591920852661133, + "kl": 1.442638648673892, + "learning_rate": 8.252666214859934e-07, + "loss": 0.1443, + "num_tokens": 12187224.0, + "reward": 0.73504638671875, + "reward_std": 0.007913686335086823, + "rewards//mean": 0.73504638671875, + "rewards//std": 0.03350578248500824, + "step": 1410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2822, + "grad_norm": 2.359144687652588, + "kl": 1.9792951606214046, + "learning_rate": 8.250255489412462e-07, + "loss": 0.1979, + "num_tokens": 12195976.0, + "reward": 0.75262451171875, + "reward_std": 0.013961941003799438, + "rewards//mean": 0.75262451171875, + "rewards//std": 0.03549600765109062, + "step": 1411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2824, + "grad_norm": 5.347057342529297, + "kl": 1.660464035347104, + "learning_rate": 8.247843454762466e-07, + "loss": 0.166, + "num_tokens": 12204544.0, + "reward": 0.76068115234375, + "reward_std": 0.019290976226329803, + "rewards//mean": 0.76068115234375, + "rewards//std": 0.03252783417701721, + "step": 1412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2826, + "grad_norm": 4.361217021942139, + "kl": 2.0865957494825125, + "learning_rate": 8.245430111881517e-07, + "loss": 0.2087, + "num_tokens": 12213224.0, + "reward": 0.775634765625, + "reward_std": 0.02434913069009781, + "rewards//mean": 0.775634765625, + "rewards//std": 0.0416659414768219, + "step": 1413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2828, + "grad_norm": 6.953249454498291, + "kl": 1.9759225770831108, + "learning_rate": 8.243015461741706e-07, + "loss": 0.1976, + "num_tokens": 12221920.0, + "reward": 0.7635498046875, + "reward_std": 0.011011242866516113, + "rewards//mean": 0.7635498046875, + "rewards//std": 0.03638497740030289, + "step": 1414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.283, + "grad_norm": 3.491920232772827, + "kl": 0.8142143860459328, + "learning_rate": 8.240599505315654e-07, + "loss": 0.0814, + "num_tokens": 12230600.0, + "reward": 0.764404296875, + "reward_std": 0.005777326878160238, + "rewards//mean": 0.764404296875, + "rewards//std": 0.0344771184027195, + "step": 1415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2832, + "grad_norm": 4.880618095397949, + "kl": 1.9639583434909582, + "learning_rate": 8.238182243576511e-07, + "loss": 0.1964, + "num_tokens": 12239248.0, + "reward": 0.7703857421875, + "reward_std": 0.011037491261959076, + "rewards//mean": 0.7703857421875, + "rewards//std": 0.031300097703933716, + "step": 1416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2834, + "grad_norm": 3.7134809494018555, + "kl": 1.022446770220995, + "learning_rate": 8.235763677497945e-07, + "loss": 0.1022, + "num_tokens": 12247824.0, + "reward": 0.7918701171875, + "reward_std": 0.008282292634248734, + "rewards//mean": 0.7918701171875, + "rewards//std": 0.024094512686133385, + "step": 1417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2836, + "grad_norm": 5.692349910736084, + "kl": 1.8521916195750237, + "learning_rate": 8.233343808054157e-07, + "loss": 0.1852, + "num_tokens": 12256496.0, + "reward": 0.75799560546875, + "reward_std": 0.018615327775478363, + "rewards//mean": 0.75799560546875, + "rewards//std": 0.039850860834121704, + "step": 1418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2838, + "grad_norm": 9.783778190612793, + "kl": 1.7258297987282276, + "learning_rate": 8.23092263621987e-07, + "loss": 0.1726, + "num_tokens": 12265152.0, + "reward": 0.7305908203125, + "reward_std": 0.008400633931159973, + "rewards//mean": 0.7305908203125, + "rewards//std": 0.03428758308291435, + "step": 1419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.284, + "grad_norm": 3.2923576831817627, + "kl": 1.6641437392681837, + "learning_rate": 8.228500162970332e-07, + "loss": 0.1664, + "num_tokens": 12273816.0, + "reward": 0.740478515625, + "reward_std": 0.019815467298030853, + "rewards//mean": 0.740478515625, + "rewards//std": 0.050572752952575684, + "step": 1420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2842, + "grad_norm": 9.795512199401855, + "kl": 1.4889667555689812, + "learning_rate": 8.226076389281314e-07, + "loss": 0.1489, + "num_tokens": 12282528.0, + "reward": 0.76580810546875, + "reward_std": 0.014538135379552841, + "rewards//mean": 0.76580810546875, + "rewards//std": 0.03402137756347656, + "step": 1421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2844, + "grad_norm": 3.6462700366973877, + "kl": 1.158252900466323, + "learning_rate": 8.223651316129114e-07, + "loss": 0.1158, + "num_tokens": 12291168.0, + "reward": 0.72418212890625, + "reward_std": 0.007890328764915466, + "rewards//mean": 0.72418212890625, + "rewards//std": 0.02440561354160309, + "step": 1422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2846, + "grad_norm": 7.619853973388672, + "kl": 0.9519822169095278, + "learning_rate": 8.221224944490548e-07, + "loss": 0.0952, + "num_tokens": 12299824.0, + "reward": 0.7584228515625, + "reward_std": 0.010799422860145569, + "rewards//mean": 0.7584228515625, + "rewards//std": 0.034787241369485855, + "step": 1423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2848, + "grad_norm": 2.0467350482940674, + "kl": 1.29447266086936, + "learning_rate": 8.21879727534296e-07, + "loss": 0.1294, + "num_tokens": 12308464.0, + "reward": 0.78607177734375, + "reward_std": 0.00892403069883585, + "rewards//mean": 0.78607177734375, + "rewards//std": 0.03117816522717476, + "step": 1424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.285, + "grad_norm": 2.3626134395599365, + "kl": 0.8906732350587845, + "learning_rate": 8.216368309664213e-07, + "loss": 0.0891, + "num_tokens": 12316984.0, + "reward": 0.77587890625, + "reward_std": 0.005383210722357035, + "rewards//mean": 0.77587890625, + "rewards//std": 0.02964716963469982, + "step": 1425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2852, + "grad_norm": 0.7195881009101868, + "kl": 0.7214030046015978, + "learning_rate": 8.213938048432696e-07, + "loss": 0.0721, + "num_tokens": 12325648.0, + "reward": 0.77227783203125, + "reward_std": 0.005835712421685457, + "rewards//mean": 0.77227783203125, + "rewards//std": 0.02550298348069191, + "step": 1426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2854, + "grad_norm": 3.8188791275024414, + "kl": 1.3568202015012503, + "learning_rate": 8.211506492627318e-07, + "loss": 0.1357, + "num_tokens": 12334280.0, + "reward": 0.73175048828125, + "reward_std": 0.009329218417406082, + "rewards//mean": 0.73175048828125, + "rewards//std": 0.032556209713220596, + "step": 1427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2856, + "grad_norm": 4.030502796173096, + "kl": 0.928346149623394, + "learning_rate": 8.209073643227509e-07, + "loss": 0.0928, + "num_tokens": 12342960.0, + "reward": 0.744140625, + "reward_std": 0.009696818888187408, + "rewards//mean": 0.744140625, + "rewards//std": 0.04009667783975601, + "step": 1428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2858, + "grad_norm": 2.174384593963623, + "kl": 0.8410797268152237, + "learning_rate": 8.206639501213219e-07, + "loss": 0.0841, + "num_tokens": 12351680.0, + "reward": 0.7952880859375, + "reward_std": 0.005080194212496281, + "rewards//mean": 0.7952880859375, + "rewards//std": 0.023101497441530228, + "step": 1429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.286, + "grad_norm": 7.616950035095215, + "kl": 1.8224603720009327, + "learning_rate": 8.204204067564924e-07, + "loss": 0.1822, + "num_tokens": 12360304.0, + "reward": 0.74810791015625, + "reward_std": 0.010432298295199871, + "rewards//mean": 0.74810791015625, + "rewards//std": 0.023281892761588097, + "step": 1430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2862, + "grad_norm": 4.008780002593994, + "kl": 1.6188208274543285, + "learning_rate": 8.201767343263611e-07, + "loss": 0.1619, + "num_tokens": 12368984.0, + "reward": 0.7171630859375, + "reward_std": 0.009844319894909859, + "rewards//mean": 0.7171630859375, + "rewards//std": 0.03988299518823624, + "step": 1431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2864, + "grad_norm": 2.8394522666931152, + "kl": 1.9309929125010967, + "learning_rate": 8.199329329290796e-07, + "loss": 0.1931, + "num_tokens": 12377728.0, + "reward": 0.76324462890625, + "reward_std": 0.0168285071849823, + "rewards//mean": 0.76324462890625, + "rewards//std": 0.03365769237279892, + "step": 1432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2866, + "grad_norm": 3.6594748497009277, + "kl": 0.829441886395216, + "learning_rate": 8.19689002662851e-07, + "loss": 0.0829, + "num_tokens": 12386304.0, + "reward": 0.72027587890625, + "reward_std": 0.006811304483562708, + "rewards//mean": 0.72027587890625, + "rewards//std": 0.034573525190353394, + "step": 1433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2868, + "grad_norm": 8.208528518676758, + "kl": 1.313731512054801, + "learning_rate": 8.194449436259303e-07, + "loss": 0.1314, + "num_tokens": 12394880.0, + "reward": 0.78411865234375, + "reward_std": 0.006044465582817793, + "rewards//mean": 0.78411865234375, + "rewards//std": 0.024678243324160576, + "step": 1434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.287, + "grad_norm": 1.9686647653579712, + "kl": 0.4446337874978781, + "learning_rate": 8.192007559166247e-07, + "loss": 0.0445, + "num_tokens": 12403512.0, + "reward": 0.75067138671875, + "reward_std": 0.0007085598772391677, + "rewards//mean": 0.75067138671875, + "rewards//std": 0.02182750217616558, + "step": 1435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2872, + "grad_norm": 1.48588228225708, + "kl": 1.269335813820362, + "learning_rate": 8.189564396332926e-07, + "loss": 0.1269, + "num_tokens": 12412144.0, + "reward": 0.77288818359375, + "reward_std": 0.0065114363096654415, + "rewards//mean": 0.77288818359375, + "rewards//std": 0.02634557895362377, + "step": 1436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2874, + "grad_norm": 3.78623628616333, + "kl": 1.1646060831844807, + "learning_rate": 8.187119948743449e-07, + "loss": 0.1165, + "num_tokens": 12420840.0, + "reward": 0.74029541015625, + "reward_std": 0.00750130508095026, + "rewards//mean": 0.74029541015625, + "rewards//std": 0.02111041732132435, + "step": 1437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2876, + "grad_norm": 4.180212497711182, + "kl": 0.7835429180413485, + "learning_rate": 8.184674217382437e-07, + "loss": 0.0784, + "num_tokens": 12429520.0, + "reward": 0.71539306640625, + "reward_std": 0.007275192998349667, + "rewards//mean": 0.71539306640625, + "rewards//std": 0.036230869591236115, + "step": 1438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2878, + "grad_norm": 3.0557329654693604, + "kl": 1.472244618460536, + "learning_rate": 8.182227203235031e-07, + "loss": 0.1472, + "num_tokens": 12438240.0, + "reward": 0.754638671875, + "reward_std": 0.01097121275961399, + "rewards//mean": 0.754638671875, + "rewards//std": 0.032851118594408035, + "step": 1439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.288, + "grad_norm": 2.872533082962036, + "kl": 1.2290428895503283, + "learning_rate": 8.179778907286887e-07, + "loss": 0.1229, + "num_tokens": 12446904.0, + "reward": 0.73321533203125, + "reward_std": 0.008530453778803349, + "rewards//mean": 0.73321533203125, + "rewards//std": 0.02781081199645996, + "step": 1440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2882, + "grad_norm": 14.644976615905762, + "kl": 2.908546209335327, + "learning_rate": 8.177329330524181e-07, + "loss": 0.2909, + "num_tokens": 12455888.0, + "reward": 0.6920166015625, + "reward_std": 0.01498452853411436, + "rewards//mean": 0.6920166015625, + "rewards//std": 0.05833989009261131, + "step": 1441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2884, + "grad_norm": 3.194599151611328, + "kl": 1.943106023594737, + "learning_rate": 8.1748784739336e-07, + "loss": 0.1943, + "num_tokens": 12464520.0, + "reward": 0.7371826171875, + "reward_std": 0.006786532700061798, + "rewards//mean": 0.7371826171875, + "rewards//std": 0.02880997397005558, + "step": 1442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2886, + "grad_norm": 3.254049062728882, + "kl": 0.7659182902425528, + "learning_rate": 8.17242633850235e-07, + "loss": 0.0766, + "num_tokens": 12473136.0, + "reward": 0.75555419921875, + "reward_std": 0.00260446360334754, + "rewards//mean": 0.75555419921875, + "rewards//std": 0.020061077550053596, + "step": 1443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2888, + "grad_norm": 4.499659538269043, + "kl": 1.9864756613969803, + "learning_rate": 8.16997292521815e-07, + "loss": 0.1986, + "num_tokens": 12481776.0, + "reward": 0.775390625, + "reward_std": 0.010401003062725067, + "rewards//mean": 0.775390625, + "rewards//std": 0.03470904380083084, + "step": 1444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.289, + "grad_norm": 5.809593200683594, + "kl": 1.9560941476374865, + "learning_rate": 8.167518235069234e-07, + "loss": 0.1956, + "num_tokens": 12490408.0, + "reward": 0.76617431640625, + "reward_std": 0.01820463128387928, + "rewards//mean": 0.76617431640625, + "rewards//std": 0.03307150676846504, + "step": 1445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2892, + "grad_norm": 3.595529079437256, + "kl": 1.4351292960345745, + "learning_rate": 8.165062269044352e-07, + "loss": 0.1435, + "num_tokens": 12498952.0, + "reward": 0.73309326171875, + "reward_std": 0.010006649419665337, + "rewards//mean": 0.73309326171875, + "rewards//std": 0.030346043407917023, + "step": 1446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2894, + "grad_norm": 2.889712333679199, + "kl": 1.7049615774303675, + "learning_rate": 8.162605028132768e-07, + "loss": 0.1705, + "num_tokens": 12507640.0, + "reward": 0.7738037109375, + "reward_std": 0.010479219257831573, + "rewards//mean": 0.7738037109375, + "rewards//std": 0.025331832468509674, + "step": 1447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2896, + "grad_norm": 12.884332656860352, + "kl": 2.1789142582565546, + "learning_rate": 8.160146513324254e-07, + "loss": 0.2179, + "num_tokens": 12516336.0, + "reward": 0.714599609375, + "reward_std": 0.0071917022578418255, + "rewards//mean": 0.714599609375, + "rewards//std": 0.03260691091418266, + "step": 1448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2898, + "grad_norm": 3.4388458728790283, + "kl": 1.781784588471055, + "learning_rate": 8.157686725609105e-07, + "loss": 0.1782, + "num_tokens": 12524968.0, + "reward": 0.7850341796875, + "reward_std": 0.015044385567307472, + "rewards//mean": 0.7850341796875, + "rewards//std": 0.03369618207216263, + "step": 1449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.29, + "grad_norm": 10.754867553710938, + "kl": 2.7751773670315742, + "learning_rate": 8.155225665978118e-07, + "loss": 0.2775, + "num_tokens": 12533680.0, + "reward": 0.740234375, + "reward_std": 0.013198032043874264, + "rewards//mean": 0.740234375, + "rewards//std": 0.036959774792194366, + "step": 1450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2902, + "grad_norm": 4.8012261390686035, + "kl": 0.6618184391409159, + "learning_rate": 8.152763335422612e-07, + "loss": 0.0662, + "num_tokens": 12542256.0, + "reward": 0.75982666015625, + "reward_std": 0.006642960011959076, + "rewards//mean": 0.75982666015625, + "rewards//std": 0.028561091050505638, + "step": 1451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2904, + "grad_norm": 0.9024642109870911, + "kl": 0.6103915609419346, + "learning_rate": 8.150299734934412e-07, + "loss": 0.061, + "num_tokens": 12550904.0, + "reward": 0.73675537109375, + "reward_std": 0.0029347692616283894, + "rewards//mean": 0.73675537109375, + "rewards//std": 0.022995345294475555, + "step": 1452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2906, + "grad_norm": 8.933744430541992, + "kl": 2.170983050018549, + "learning_rate": 8.147834865505853e-07, + "loss": 0.2171, + "num_tokens": 12559496.0, + "reward": 0.72320556640625, + "reward_std": 0.009682497940957546, + "rewards//mean": 0.72320556640625, + "rewards//std": 0.04538314789533615, + "step": 1453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2908, + "grad_norm": 2.9089457988739014, + "kl": 0.6553179547190666, + "learning_rate": 8.145368728129789e-07, + "loss": 0.0655, + "num_tokens": 12568048.0, + "reward": 0.74383544921875, + "reward_std": 0.0035527851432561874, + "rewards//mean": 0.74383544921875, + "rewards//std": 0.025993304327130318, + "step": 1454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.291, + "grad_norm": 2.4311299324035645, + "kl": 0.8503626752644777, + "learning_rate": 8.142901323799577e-07, + "loss": 0.085, + "num_tokens": 12576680.0, + "reward": 0.738525390625, + "reward_std": 0.004661278799176216, + "rewards//mean": 0.738525390625, + "rewards//std": 0.03340674936771393, + "step": 1455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2912, + "grad_norm": 10.566218376159668, + "kl": 1.4250258300453424, + "learning_rate": 8.140432653509087e-07, + "loss": 0.1425, + "num_tokens": 12585336.0, + "reward": 0.761474609375, + "reward_std": 0.009308917447924614, + "rewards//mean": 0.761474609375, + "rewards//std": 0.03202228993177414, + "step": 1456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2914, + "grad_norm": 4.898598670959473, + "kl": 1.3507120609283447, + "learning_rate": 8.1379627182527e-07, + "loss": 0.1351, + "num_tokens": 12593952.0, + "reward": 0.75079345703125, + "reward_std": 0.004424169193953276, + "rewards//mean": 0.75079345703125, + "rewards//std": 0.03937855735421181, + "step": 1457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2916, + "grad_norm": 4.961860656738281, + "kl": 1.54135200381279, + "learning_rate": 8.135491519025306e-07, + "loss": 0.1541, + "num_tokens": 12602672.0, + "reward": 0.7462158203125, + "reward_std": 0.009684968739748001, + "rewards//mean": 0.7462158203125, + "rewards//std": 0.037782661616802216, + "step": 1458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2918, + "grad_norm": 3.2244813442230225, + "kl": 1.0334641635417938, + "learning_rate": 8.133019056822302e-07, + "loss": 0.1033, + "num_tokens": 12611336.0, + "reward": 0.7684326171875, + "reward_std": 0.00849007535725832, + "rewards//mean": 0.7684326171875, + "rewards//std": 0.030028430745005608, + "step": 1459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.292, + "grad_norm": 4.305205821990967, + "kl": 1.560680564492941, + "learning_rate": 8.130545332639597e-07, + "loss": 0.1561, + "num_tokens": 12619976.0, + "reward": 0.73626708984375, + "reward_std": 0.011509668081998825, + "rewards//mean": 0.73626708984375, + "rewards//std": 0.032196663320064545, + "step": 1460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2922, + "grad_norm": 5.1138505935668945, + "kl": 1.9591032322496176, + "learning_rate": 8.128070347473608e-07, + "loss": 0.1959, + "num_tokens": 12628720.0, + "reward": 0.7720947265625, + "reward_std": 0.012277388945221901, + "rewards//mean": 0.7720947265625, + "rewards//std": 0.031205160543322563, + "step": 1461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2924, + "grad_norm": 2.7395927906036377, + "kl": 0.6407976988703012, + "learning_rate": 8.125594102321255e-07, + "loss": 0.0641, + "num_tokens": 12637344.0, + "reward": 0.782470703125, + "reward_std": 0.0028777476400136948, + "rewards//mean": 0.782470703125, + "rewards//std": 0.028500672429800034, + "step": 1462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2926, + "grad_norm": 4.052397727966309, + "kl": 0.8177272789180279, + "learning_rate": 8.123116598179971e-07, + "loss": 0.0818, + "num_tokens": 12646096.0, + "reward": 0.758544921875, + "reward_std": 0.008035607635974884, + "rewards//mean": 0.758544921875, + "rewards//std": 0.037711478769779205, + "step": 1463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2928, + "grad_norm": 6.73102331161499, + "kl": 1.5023040790110826, + "learning_rate": 8.120637836047697e-07, + "loss": 0.1502, + "num_tokens": 12654704.0, + "reward": 0.74102783203125, + "reward_std": 0.01163367461413145, + "rewards//mean": 0.74102783203125, + "rewards//std": 0.03166331350803375, + "step": 1464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.293, + "grad_norm": 8.07838249206543, + "kl": 1.9107473753392696, + "learning_rate": 8.118157816922874e-07, + "loss": 0.1911, + "num_tokens": 12663376.0, + "reward": 0.72015380859375, + "reward_std": 0.010127536021173, + "rewards//mean": 0.72015380859375, + "rewards//std": 0.03241688758134842, + "step": 1465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2932, + "grad_norm": 7.672347545623779, + "kl": 1.8996401652693748, + "learning_rate": 8.115676541804455e-07, + "loss": 0.19, + "num_tokens": 12671992.0, + "reward": 0.7203369140625, + "reward_std": 0.01306104101240635, + "rewards//mean": 0.7203369140625, + "rewards//std": 0.05011012405157089, + "step": 1466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2934, + "grad_norm": 4.347347259521484, + "kl": 1.485631575807929, + "learning_rate": 8.113194011691899e-07, + "loss": 0.1486, + "num_tokens": 12680584.0, + "reward": 0.74786376953125, + "reward_std": 0.014387952163815498, + "rewards//mean": 0.74786376953125, + "rewards//std": 0.03491033613681793, + "step": 1467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2936, + "grad_norm": 1.2490336894989014, + "kl": 0.9858845826238394, + "learning_rate": 8.110710227585167e-07, + "loss": 0.0986, + "num_tokens": 12689168.0, + "reward": 0.77215576171875, + "reward_std": 0.008455757983028889, + "rewards//mean": 0.77215576171875, + "rewards//std": 0.023908693343400955, + "step": 1468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2938, + "grad_norm": 4.55409574508667, + "kl": 1.7242893744260073, + "learning_rate": 8.108225190484726e-07, + "loss": 0.1724, + "num_tokens": 12697840.0, + "reward": 0.7286376953125, + "reward_std": 0.014607956632971764, + "rewards//mean": 0.7286376953125, + "rewards//std": 0.04412269964814186, + "step": 1469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.294, + "grad_norm": 2.291954278945923, + "kl": 2.1257865503430367, + "learning_rate": 8.105738901391551e-07, + "loss": 0.2126, + "num_tokens": 12706584.0, + "reward": 0.771240234375, + "reward_std": 0.014790792018175125, + "rewards//mean": 0.771240234375, + "rewards//std": 0.035096779465675354, + "step": 1470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2942, + "grad_norm": 7.725774765014648, + "kl": 0.7946789208799601, + "learning_rate": 8.103251361307118e-07, + "loss": 0.0795, + "num_tokens": 12715280.0, + "reward": 0.74493408203125, + "reward_std": 0.0026413900777697563, + "rewards//mean": 0.74493408203125, + "rewards//std": 0.026727208867669106, + "step": 1471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2944, + "grad_norm": 1.7929853200912476, + "kl": 1.6168434005230665, + "learning_rate": 8.100762571233408e-07, + "loss": 0.1617, + "num_tokens": 12723888.0, + "reward": 0.764892578125, + "reward_std": 0.011940587311983109, + "rewards//mean": 0.764892578125, + "rewards//std": 0.030583124607801437, + "step": 1472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2946, + "grad_norm": 2.7661356925964355, + "kl": 1.3101904802024364, + "learning_rate": 8.098272532172905e-07, + "loss": 0.131, + "num_tokens": 12732704.0, + "reward": 0.73382568359375, + "reward_std": 0.011408751830458641, + "rewards//mean": 0.73382568359375, + "rewards//std": 0.03307882696390152, + "step": 1473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2948, + "grad_norm": 3.110581159591675, + "kl": 1.0171023327857256, + "learning_rate": 8.095781245128597e-07, + "loss": 0.1017, + "num_tokens": 12741424.0, + "reward": 0.7440185546875, + "reward_std": 0.007515724282711744, + "rewards//mean": 0.7440185546875, + "rewards//std": 0.03568596765398979, + "step": 1474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.295, + "grad_norm": 1.9205505847930908, + "kl": 0.822709271684289, + "learning_rate": 8.093288711103971e-07, + "loss": 0.0823, + "num_tokens": 12749984.0, + "reward": 0.7606201171875, + "reward_std": 0.006251954939216375, + "rewards//mean": 0.7606201171875, + "rewards//std": 0.027904614806175232, + "step": 1475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2952, + "grad_norm": 5.270324230194092, + "kl": 1.0679902900010347, + "learning_rate": 8.090794931103026e-07, + "loss": 0.1068, + "num_tokens": 12758592.0, + "reward": 0.75225830078125, + "reward_std": 0.006496988236904144, + "rewards//mean": 0.75225830078125, + "rewards//std": 0.024898696690797806, + "step": 1476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2954, + "grad_norm": 4.030917167663574, + "kl": 1.2331567518413067, + "learning_rate": 8.08829990613025e-07, + "loss": 0.1233, + "num_tokens": 12767192.0, + "reward": 0.7545166015625, + "reward_std": 0.013106499798595905, + "rewards//mean": 0.7545166015625, + "rewards//std": 0.03440921753644943, + "step": 1477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2956, + "grad_norm": 3.2291336059570312, + "kl": 1.6347051300108433, + "learning_rate": 8.085803637190643e-07, + "loss": 0.1635, + "num_tokens": 12775832.0, + "reward": 0.786865234375, + "reward_std": 0.01683385856449604, + "rewards//mean": 0.786865234375, + "rewards//std": 0.035172607749700546, + "step": 1478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2958, + "grad_norm": 5.795406818389893, + "kl": 1.2767824828624725, + "learning_rate": 8.083306125289697e-07, + "loss": 0.1277, + "num_tokens": 12784360.0, + "reward": 0.755859375, + "reward_std": 0.009569985792040825, + "rewards//mean": 0.755859375, + "rewards//std": 0.03326234221458435, + "step": 1479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.296, + "grad_norm": 8.213812828063965, + "kl": 1.231348818168044, + "learning_rate": 8.080807371433414e-07, + "loss": 0.1231, + "num_tokens": 12792968.0, + "reward": 0.75750732421875, + "reward_std": 0.010824731551110744, + "rewards//mean": 0.75750732421875, + "rewards//std": 0.027333160862326622, + "step": 1480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2962, + "grad_norm": 3.7849719524383545, + "kl": 0.7953167650848627, + "learning_rate": 8.07830737662829e-07, + "loss": 0.0795, + "num_tokens": 12801640.0, + "reward": 0.7484130859375, + "reward_std": 0.004670898430049419, + "rewards//mean": 0.7484130859375, + "rewards//std": 0.025626515969634056, + "step": 1481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2964, + "grad_norm": 5.114973068237305, + "kl": 1.5109335407614708, + "learning_rate": 8.075806141881325e-07, + "loss": 0.1511, + "num_tokens": 12810376.0, + "reward": 0.76104736328125, + "reward_std": 0.010529964230954647, + "rewards//mean": 0.76104736328125, + "rewards//std": 0.03358520567417145, + "step": 1482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2966, + "grad_norm": 3.346491575241089, + "kl": 1.2512095645070076, + "learning_rate": 8.073303668200011e-07, + "loss": 0.1251, + "num_tokens": 12819056.0, + "reward": 0.77276611328125, + "reward_std": 0.00834638625383377, + "rewards//mean": 0.77276611328125, + "rewards//std": 0.02868431806564331, + "step": 1483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2968, + "grad_norm": 11.776288032531738, + "kl": 1.6909576747566462, + "learning_rate": 8.070799956592349e-07, + "loss": 0.1691, + "num_tokens": 12827616.0, + "reward": 0.73974609375, + "reward_std": 0.012325368821620941, + "rewards//mean": 0.73974609375, + "rewards//std": 0.030887499451637268, + "step": 1484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.297, + "grad_norm": 3.3797404766082764, + "kl": 0.9280413258820772, + "learning_rate": 8.06829500806683e-07, + "loss": 0.0928, + "num_tokens": 12836248.0, + "reward": 0.76702880859375, + "reward_std": 0.008107181638479233, + "rewards//mean": 0.76702880859375, + "rewards//std": 0.026063675060868263, + "step": 1485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2972, + "grad_norm": 6.333701133728027, + "kl": 3.41668620146811, + "learning_rate": 8.06578882363245e-07, + "loss": 0.3417, + "num_tokens": 12844976.0, + "reward": 0.7333984375, + "reward_std": 0.02361653372645378, + "rewards//mean": 0.7333984375, + "rewards//std": 0.03670991212129593, + "step": 1486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2974, + "grad_norm": 5.154001235961914, + "kl": 1.3662318922579288, + "learning_rate": 8.063281404298699e-07, + "loss": 0.1366, + "num_tokens": 12853648.0, + "reward": 0.78631591796875, + "reward_std": 0.009496654383838177, + "rewards//mean": 0.78631591796875, + "rewards//std": 0.030818259343504906, + "step": 1487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2976, + "grad_norm": 2.8072831630706787, + "kl": 1.3740430641919374, + "learning_rate": 8.060772751075562e-07, + "loss": 0.1374, + "num_tokens": 12862296.0, + "reward": 0.7535400390625, + "reward_std": 0.006750519387423992, + "rewards//mean": 0.7535400390625, + "rewards//std": 0.022859087213873863, + "step": 1488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2978, + "grad_norm": 17.512250900268555, + "kl": 1.33380495198071, + "learning_rate": 8.058262864973528e-07, + "loss": 0.1334, + "num_tokens": 12870952.0, + "reward": 0.74346923828125, + "reward_std": 0.008682006038725376, + "rewards//mean": 0.74346923828125, + "rewards//std": 0.03712359815835953, + "step": 1489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.298, + "grad_norm": 5.1915717124938965, + "kl": 1.159490229561925, + "learning_rate": 8.055751747003579e-07, + "loss": 0.1159, + "num_tokens": 12879552.0, + "reward": 0.7369384765625, + "reward_std": 0.010565157048404217, + "rewards//mean": 0.7369384765625, + "rewards//std": 0.032288506627082825, + "step": 1490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2982, + "grad_norm": 20.961589813232422, + "kl": 2.250743241980672, + "learning_rate": 8.053239398177191e-07, + "loss": 0.2251, + "num_tokens": 12888144.0, + "reward": 0.75775146484375, + "reward_std": 0.01169489324092865, + "rewards//mean": 0.75775146484375, + "rewards//std": 0.03842165693640709, + "step": 1491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2984, + "grad_norm": 9.771661758422852, + "kl": 1.4084699284285307, + "learning_rate": 8.050725819506339e-07, + "loss": 0.1408, + "num_tokens": 12896776.0, + "reward": 0.75518798828125, + "reward_std": 0.012158473953604698, + "rewards//mean": 0.75518798828125, + "rewards//std": 0.029676197096705437, + "step": 1492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2986, + "grad_norm": 5.195116996765137, + "kl": 2.023480501025915, + "learning_rate": 8.048211012003489e-07, + "loss": 0.2023, + "num_tokens": 12905488.0, + "reward": 0.74920654296875, + "reward_std": 0.014508301392197609, + "rewards//mean": 0.74920654296875, + "rewards//std": 0.026659158989787102, + "step": 1493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2988, + "grad_norm": 3.7036025524139404, + "kl": 1.2176567781716585, + "learning_rate": 8.045694976681612e-07, + "loss": 0.1218, + "num_tokens": 12914184.0, + "reward": 0.73602294921875, + "reward_std": 0.010885203257203102, + "rewards//mean": 0.73602294921875, + "rewards//std": 0.03514456748962402, + "step": 1494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.299, + "grad_norm": 5.165862560272217, + "kl": 2.218764767050743, + "learning_rate": 8.043177714554159e-07, + "loss": 0.2219, + "num_tokens": 12922832.0, + "reward": 0.76495361328125, + "reward_std": 0.014915489591658115, + "rewards//mean": 0.76495361328125, + "rewards//std": 0.03475475311279297, + "step": 1495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2992, + "grad_norm": 5.861644268035889, + "kl": 2.293422434478998, + "learning_rate": 8.04065922663509e-07, + "loss": 0.2293, + "num_tokens": 12931536.0, + "reward": 0.78033447265625, + "reward_std": 0.01676148921251297, + "rewards//mean": 0.78033447265625, + "rewards//std": 0.04310518130660057, + "step": 1496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2994, + "grad_norm": 23.607648849487305, + "kl": 3.8990252763032913, + "learning_rate": 8.038139513938845e-07, + "loss": 0.3899, + "num_tokens": 12940248.0, + "reward": 0.733154296875, + "reward_std": 0.011588525958359241, + "rewards//mean": 0.733154296875, + "rewards//std": 0.03267369419336319, + "step": 1497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2996, + "grad_norm": 23.689645767211914, + "kl": 3.6418681479990482, + "learning_rate": 8.035618577480369e-07, + "loss": 0.3642, + "num_tokens": 12948888.0, + "reward": 0.7723388671875, + "reward_std": 0.017906051129102707, + "rewards//mean": 0.7723388671875, + "rewards//std": 0.03809865564107895, + "step": 1498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.2998, + "grad_norm": 9.937276840209961, + "kl": 2.2664911299943924, + "learning_rate": 8.033096418275092e-07, + "loss": 0.2266, + "num_tokens": 12957584.0, + "reward": 0.72021484375, + "reward_std": 0.011843969114124775, + "rewards//mean": 0.72021484375, + "rewards//std": 0.046201031655073166, + "step": 1499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3, + "grad_norm": 7.549541473388672, + "kl": 1.1379733439534903, + "learning_rate": 8.030573037338941e-07, + "loss": 0.1138, + "num_tokens": 12966240.0, + "reward": 0.77044677734375, + "reward_std": 0.01573409140110016, + "rewards//mean": 0.77044677734375, + "rewards//std": 0.02930404432117939, + "step": 1500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3002, + "grad_norm": 2.8214757442474365, + "kl": 1.3537630531936884, + "learning_rate": 8.028048435688333e-07, + "loss": 0.1354, + "num_tokens": 12974816.0, + "reward": 0.77447509765625, + "reward_std": 0.00893948134034872, + "rewards//mean": 0.77447509765625, + "rewards//std": 0.03006690926849842, + "step": 1501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3004, + "grad_norm": 4.832440376281738, + "kl": 1.951517477631569, + "learning_rate": 8.025522614340177e-07, + "loss": 0.1952, + "num_tokens": 12983344.0, + "reward": 0.7366943359375, + "reward_std": 0.018012333661317825, + "rewards//mean": 0.7366943359375, + "rewards//std": 0.04109729453921318, + "step": 1502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3006, + "grad_norm": 6.3301286697387695, + "kl": 2.010582856833935, + "learning_rate": 8.022995574311875e-07, + "loss": 0.2011, + "num_tokens": 12992024.0, + "reward": 0.7222900390625, + "reward_std": 0.010570096783339977, + "rewards//mean": 0.7222900390625, + "rewards//std": 0.03358638659119606, + "step": 1503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3008, + "grad_norm": 4.824504852294922, + "kl": 1.5263384692370892, + "learning_rate": 8.020467316621316e-07, + "loss": 0.1526, + "num_tokens": 13000600.0, + "reward": 0.7816162109375, + "reward_std": 0.009848502464592457, + "rewards//mean": 0.7816162109375, + "rewards//std": 0.02930385060608387, + "step": 1504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.301, + "grad_norm": 4.907270431518555, + "kl": 1.3549659363925457, + "learning_rate": 8.017937842286882e-07, + "loss": 0.1355, + "num_tokens": 13009272.0, + "reward": 0.75518798828125, + "reward_std": 0.012927262112498283, + "rewards//mean": 0.75518798828125, + "rewards//std": 0.02927355095744133, + "step": 1505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3012, + "grad_norm": 2.6145718097686768, + "kl": 1.1812763661146164, + "learning_rate": 8.015407152327447e-07, + "loss": 0.1181, + "num_tokens": 13017872.0, + "reward": 0.74652099609375, + "reward_std": 0.007757539860904217, + "rewards//mean": 0.74652099609375, + "rewards//std": 0.03364689648151398, + "step": 1506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3014, + "grad_norm": 3.4602737426757812, + "kl": 1.3582522384822369, + "learning_rate": 8.012875247762372e-07, + "loss": 0.1358, + "num_tokens": 13026464.0, + "reward": 0.7747802734375, + "reward_std": 0.009007740765810013, + "rewards//mean": 0.7747802734375, + "rewards//std": 0.02747381664812565, + "step": 1507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3016, + "grad_norm": 2.8289504051208496, + "kl": 1.087059661746025, + "learning_rate": 8.010342129611507e-07, + "loss": 0.1087, + "num_tokens": 13035080.0, + "reward": 0.74578857421875, + "reward_std": 0.0074275191873312, + "rewards//mean": 0.74578857421875, + "rewards//std": 0.027737779542803764, + "step": 1508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3018, + "grad_norm": 5.269764423370361, + "kl": 1.1959709245711565, + "learning_rate": 8.007807798895193e-07, + "loss": 0.1196, + "num_tokens": 13043824.0, + "reward": 0.74530029296875, + "reward_std": 0.012596560642123222, + "rewards//mean": 0.74530029296875, + "rewards//std": 0.03109600394964218, + "step": 1509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.302, + "grad_norm": 2.1235713958740234, + "kl": 1.288216095417738, + "learning_rate": 8.005272256634257e-07, + "loss": 0.1288, + "num_tokens": 13052464.0, + "reward": 0.7431640625, + "reward_std": 0.0078422911465168, + "rewards//mean": 0.7431640625, + "rewards//std": 0.03133514150977135, + "step": 1510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3022, + "grad_norm": 2.6874098777770996, + "kl": 1.1431039813905954, + "learning_rate": 8.002735503850015e-07, + "loss": 0.1143, + "num_tokens": 13061144.0, + "reward": 0.7708740234375, + "reward_std": 0.008286407217383385, + "rewards//mean": 0.7708740234375, + "rewards//std": 0.029158851131796837, + "step": 1511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3024, + "grad_norm": 4.7610697746276855, + "kl": 1.9959817864000797, + "learning_rate": 8.000197541564271e-07, + "loss": 0.1996, + "num_tokens": 13069808.0, + "reward": 0.73052978515625, + "reward_std": 0.01354941911995411, + "rewards//mean": 0.73052978515625, + "rewards//std": 0.038079727441072464, + "step": 1512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3026, + "grad_norm": 8.037347793579102, + "kl": 1.512846115976572, + "learning_rate": 7.997658370799316e-07, + "loss": 0.1513, + "num_tokens": 13078400.0, + "reward": 0.761474609375, + "reward_std": 0.010340271517634392, + "rewards//mean": 0.761474609375, + "rewards//std": 0.027751486748456955, + "step": 1513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3028, + "grad_norm": 4.4248046875, + "kl": 1.079721711575985, + "learning_rate": 7.995117992577928e-07, + "loss": 0.108, + "num_tokens": 13086976.0, + "reward": 0.7510986328125, + "reward_std": 0.008711807429790497, + "rewards//mean": 0.7510986328125, + "rewards//std": 0.023862190544605255, + "step": 1514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.303, + "grad_norm": 2.05684232711792, + "kl": 0.7509970609098673, + "learning_rate": 7.992576407923372e-07, + "loss": 0.0751, + "num_tokens": 13095616.0, + "reward": 0.7593994140625, + "reward_std": 0.005750302225351334, + "rewards//mean": 0.7593994140625, + "rewards//std": 0.029113130643963814, + "step": 1515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3032, + "grad_norm": 5.213685035705566, + "kl": 1.4956692047417164, + "learning_rate": 7.990033617859395e-07, + "loss": 0.1496, + "num_tokens": 13104176.0, + "reward": 0.7369384765625, + "reward_std": 0.008287956938147545, + "rewards//mean": 0.7369384765625, + "rewards//std": 0.026238271966576576, + "step": 1516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3034, + "grad_norm": 3.9885752201080322, + "kl": 1.2843257393687963, + "learning_rate": 7.987489623410235e-07, + "loss": 0.1284, + "num_tokens": 13112776.0, + "reward": 0.74688720703125, + "reward_std": 0.00606179004535079, + "rewards//mean": 0.74688720703125, + "rewards//std": 0.03626636788249016, + "step": 1517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3036, + "grad_norm": 4.215937614440918, + "kl": 1.8588938284665346, + "learning_rate": 7.984944425600613e-07, + "loss": 0.1859, + "num_tokens": 13121368.0, + "reward": 0.76422119140625, + "reward_std": 0.016526661813259125, + "rewards//mean": 0.76422119140625, + "rewards//std": 0.038420867174863815, + "step": 1518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3038, + "grad_norm": 3.2377138137817383, + "kl": 1.2811475973576307, + "learning_rate": 7.982398025455732e-07, + "loss": 0.1281, + "num_tokens": 13130136.0, + "reward": 0.7803955078125, + "reward_std": 0.00979221984744072, + "rewards//mean": 0.7803955078125, + "rewards//std": 0.02574673853814602, + "step": 1519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.304, + "grad_norm": 4.779201507568359, + "kl": 1.1627158615738153, + "learning_rate": 7.979850424001282e-07, + "loss": 0.1163, + "num_tokens": 13138768.0, + "reward": 0.76025390625, + "reward_std": 0.008882921189069748, + "rewards//mean": 0.76025390625, + "rewards//std": 0.026607505977153778, + "step": 1520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3042, + "grad_norm": 1.3904224634170532, + "kl": 0.8005918823182583, + "learning_rate": 7.97730162226344e-07, + "loss": 0.0801, + "num_tokens": 13147480.0, + "reward": 0.74847412109375, + "reward_std": 0.004718668758869171, + "rewards//mean": 0.74847412109375, + "rewards//std": 0.029835429042577744, + "step": 1521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3044, + "grad_norm": 5.867562770843506, + "kl": 1.6294219363480806, + "learning_rate": 7.974751621268858e-07, + "loss": 0.1629, + "num_tokens": 13156176.0, + "reward": 0.7520751953125, + "reward_std": 0.009769590571522713, + "rewards//mean": 0.7520751953125, + "rewards//std": 0.03724679350852966, + "step": 1522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3046, + "grad_norm": 18.40738868713379, + "kl": 1.7692720592021942, + "learning_rate": 7.972200422044682e-07, + "loss": 0.1769, + "num_tokens": 13164800.0, + "reward": 0.74969482421875, + "reward_std": 0.016118552535772324, + "rewards//mean": 0.74969482421875, + "rewards//std": 0.03148400038480759, + "step": 1523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3048, + "grad_norm": 13.267224311828613, + "kl": 0.8517594467848539, + "learning_rate": 7.969648025618529e-07, + "loss": 0.0852, + "num_tokens": 13173432.0, + "reward": 0.7694091796875, + "reward_std": 0.0037979367189109325, + "rewards//mean": 0.7694091796875, + "rewards//std": 0.028133686631917953, + "step": 1524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.305, + "grad_norm": 11.241990089416504, + "kl": 1.5441111326217651, + "learning_rate": 7.967094433018508e-07, + "loss": 0.1544, + "num_tokens": 13182008.0, + "reward": 0.760498046875, + "reward_std": 0.012233583256602287, + "rewards//mean": 0.760498046875, + "rewards//std": 0.023917002603411674, + "step": 1525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3052, + "grad_norm": 4.886778831481934, + "kl": 0.9253472704440355, + "learning_rate": 7.964539645273202e-07, + "loss": 0.0925, + "num_tokens": 13190560.0, + "reward": 0.776123046875, + "reward_std": 0.008092978969216347, + "rewards//mean": 0.776123046875, + "rewards//std": 0.0288217943161726, + "step": 1526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3054, + "grad_norm": 6.10654878616333, + "kl": 1.8288121819496155, + "learning_rate": 7.961983663411684e-07, + "loss": 0.1829, + "num_tokens": 13199200.0, + "reward": 0.75701904296875, + "reward_std": 0.011038584634661674, + "rewards//mean": 0.75701904296875, + "rewards//std": 0.03996465727686882, + "step": 1527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3056, + "grad_norm": 21.48021125793457, + "kl": 2.005971573293209, + "learning_rate": 7.959426488463499e-07, + "loss": 0.2006, + "num_tokens": 13207840.0, + "reward": 0.739990234375, + "reward_std": 0.012318646535277367, + "rewards//mean": 0.739990234375, + "rewards//std": 0.028055289760231972, + "step": 1528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3058, + "grad_norm": 14.893638610839844, + "kl": 2.1409136690199375, + "learning_rate": 7.956868121458677e-07, + "loss": 0.2141, + "num_tokens": 13216408.0, + "reward": 0.73272705078125, + "reward_std": 0.00900019146502018, + "rewards//mean": 0.73272705078125, + "rewards//std": 0.0260398518294096, + "step": 1529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.306, + "grad_norm": 15.753615379333496, + "kl": 2.612980095669627, + "learning_rate": 7.954308563427732e-07, + "loss": 0.2613, + "num_tokens": 13225024.0, + "reward": 0.74664306640625, + "reward_std": 0.014729749411344528, + "rewards//mean": 0.74664306640625, + "rewards//std": 0.03410980850458145, + "step": 1530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3062, + "grad_norm": 6.290655612945557, + "kl": 1.0625749547034502, + "learning_rate": 7.951747815401649e-07, + "loss": 0.1063, + "num_tokens": 13233720.0, + "reward": 0.7789306640625, + "reward_std": 0.005719395820051432, + "rewards//mean": 0.7789306640625, + "rewards//std": 0.03233722969889641, + "step": 1531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3064, + "grad_norm": 3.4960575103759766, + "kl": 0.9276208523660898, + "learning_rate": 7.949185878411899e-07, + "loss": 0.0928, + "num_tokens": 13242352.0, + "reward": 0.78204345703125, + "reward_std": 0.010589463636279106, + "rewards//mean": 0.78204345703125, + "rewards//std": 0.026722678914666176, + "step": 1532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3066, + "grad_norm": 4.718594551086426, + "kl": 0.8793540969491005, + "learning_rate": 7.946622753490432e-07, + "loss": 0.0879, + "num_tokens": 13250976.0, + "reward": 0.7777099609375, + "reward_std": 0.009698061272501945, + "rewards//mean": 0.7777099609375, + "rewards//std": 0.029811890795826912, + "step": 1533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3068, + "grad_norm": 3.5431320667266846, + "kl": 1.7327657788991928, + "learning_rate": 7.94405844166967e-07, + "loss": 0.1733, + "num_tokens": 13259616.0, + "reward": 0.7159423828125, + "reward_std": 0.012289916165173054, + "rewards//mean": 0.7159423828125, + "rewards//std": 0.04350076988339424, + "step": 1534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.307, + "grad_norm": 8.551236152648926, + "kl": 2.487602587789297, + "learning_rate": 7.941492943982521e-07, + "loss": 0.2488, + "num_tokens": 13268344.0, + "reward": 0.7349853515625, + "reward_std": 0.0126652205362916, + "rewards//mean": 0.7349853515625, + "rewards//std": 0.044675033539533615, + "step": 1535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3072, + "grad_norm": 4.800576686859131, + "kl": 1.3567574676126242, + "learning_rate": 7.938926261462365e-07, + "loss": 0.1357, + "num_tokens": 13276984.0, + "reward": 0.73065185546875, + "reward_std": 0.009367650374770164, + "rewards//mean": 0.73065185546875, + "rewards//std": 0.027281051501631737, + "step": 1536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3074, + "grad_norm": 2.5340986251831055, + "kl": 1.4362577367573977, + "learning_rate": 7.936358395143063e-07, + "loss": 0.1436, + "num_tokens": 13285624.0, + "reward": 0.7435302734375, + "reward_std": 0.007179467007517815, + "rewards//mean": 0.7435302734375, + "rewards//std": 0.024351980537176132, + "step": 1537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3076, + "grad_norm": 4.1041789054870605, + "kl": 1.6592697482556105, + "learning_rate": 7.93378934605895e-07, + "loss": 0.1659, + "num_tokens": 13294312.0, + "reward": 0.75531005859375, + "reward_std": 0.014915602281689644, + "rewards//mean": 0.75531005859375, + "rewards//std": 0.030870771035552025, + "step": 1538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3078, + "grad_norm": 2.6387155055999756, + "kl": 0.9437056761234999, + "learning_rate": 7.93121911524484e-07, + "loss": 0.0944, + "num_tokens": 13302888.0, + "reward": 0.7305908203125, + "reward_std": 0.005320197436958551, + "rewards//mean": 0.7305908203125, + "rewards//std": 0.031148837879300117, + "step": 1539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.308, + "grad_norm": 2.047175645828247, + "kl": 0.7891005948185921, + "learning_rate": 7.928647703736023e-07, + "loss": 0.0789, + "num_tokens": 13311616.0, + "reward": 0.7376708984375, + "reward_std": 0.0020976788364350796, + "rewards//mean": 0.7376708984375, + "rewards//std": 0.029250076040625572, + "step": 1540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3082, + "grad_norm": 6.142187118530273, + "kl": 0.8477199338376522, + "learning_rate": 7.926075112568258e-07, + "loss": 0.0848, + "num_tokens": 13320200.0, + "reward": 0.759033203125, + "reward_std": 0.012320725247263908, + "rewards//mean": 0.759033203125, + "rewards//std": 0.034074340015649796, + "step": 1541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3084, + "grad_norm": 3.6553497314453125, + "kl": 0.8029363844543695, + "learning_rate": 7.923501342777787e-07, + "loss": 0.0803, + "num_tokens": 13328760.0, + "reward": 0.73828125, + "reward_std": 0.007218477316200733, + "rewards//mean": 0.73828125, + "rewards//std": 0.03479268029332161, + "step": 1542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3086, + "grad_norm": 6.512855529785156, + "kl": 0.8737721461802721, + "learning_rate": 7.920926395401326e-07, + "loss": 0.0874, + "num_tokens": 13337336.0, + "reward": 0.780029296875, + "reward_std": 0.007003386504948139, + "rewards//mean": 0.780029296875, + "rewards//std": 0.02661092020571232, + "step": 1543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3088, + "grad_norm": 4.729723930358887, + "kl": 2.118842177093029, + "learning_rate": 7.918350271476063e-07, + "loss": 0.2119, + "num_tokens": 13346008.0, + "reward": 0.7794189453125, + "reward_std": 0.010929237119853497, + "rewards//mean": 0.7794189453125, + "rewards//std": 0.030016331002116203, + "step": 1544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.309, + "grad_norm": 2.64178466796875, + "kl": 1.4418923668563366, + "learning_rate": 7.915772972039659e-07, + "loss": 0.1442, + "num_tokens": 13354600.0, + "reward": 0.75494384765625, + "reward_std": 0.006001880392432213, + "rewards//mean": 0.75494384765625, + "rewards//std": 0.03073612041771412, + "step": 1545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3092, + "grad_norm": 5.51864767074585, + "kl": 1.497476452961564, + "learning_rate": 7.913194498130251e-07, + "loss": 0.1497, + "num_tokens": 13363288.0, + "reward": 0.774658203125, + "reward_std": 0.010132655501365662, + "rewards//mean": 0.774658203125, + "rewards//std": 0.035027701407670975, + "step": 1546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3094, + "grad_norm": 3.6369011402130127, + "kl": 1.2751891389489174, + "learning_rate": 7.910614850786447e-07, + "loss": 0.1275, + "num_tokens": 13371904.0, + "reward": 0.748046875, + "reward_std": 0.009027308784425259, + "rewards//mean": 0.748046875, + "rewards//std": 0.02498798444867134, + "step": 1547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3096, + "grad_norm": 1.8712557554244995, + "kl": 1.1147124227136374, + "learning_rate": 7.90803403104733e-07, + "loss": 0.1115, + "num_tokens": 13380552.0, + "reward": 0.76202392578125, + "reward_std": 0.0050063710659742355, + "rewards//mean": 0.76202392578125, + "rewards//std": 0.02744811400771141, + "step": 1548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3098, + "grad_norm": 3.3328089714050293, + "kl": 1.0839637350291014, + "learning_rate": 7.905452039952451e-07, + "loss": 0.1084, + "num_tokens": 13389160.0, + "reward": 0.8018798828125, + "reward_std": 0.011048915795981884, + "rewards//mean": 0.8018798828125, + "rewards//std": 0.02959785796701908, + "step": 1549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.31, + "grad_norm": 11.956218719482422, + "kl": 2.716661686077714, + "learning_rate": 7.90286887854184e-07, + "loss": 0.2717, + "num_tokens": 13397784.0, + "reward": 0.737060546875, + "reward_std": 0.010013382881879807, + "rewards//mean": 0.737060546875, + "rewards//std": 0.03005589358508587, + "step": 1550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3102, + "grad_norm": 2.4767353534698486, + "kl": 1.0682494826614857, + "learning_rate": 7.900284547855991e-07, + "loss": 0.1068, + "num_tokens": 13406464.0, + "reward": 0.7271728515625, + "reward_std": 0.008038777858018875, + "rewards//mean": 0.7271728515625, + "rewards//std": 0.036112699657678604, + "step": 1551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3104, + "grad_norm": 3.2068850994110107, + "kl": 1.5400317627936602, + "learning_rate": 7.897699048935873e-07, + "loss": 0.154, + "num_tokens": 13415112.0, + "reward": 0.753662109375, + "reward_std": 0.01680159568786621, + "rewards//mean": 0.753662109375, + "rewards//std": 0.035962529480457306, + "step": 1552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3106, + "grad_norm": 3.355475425720215, + "kl": 1.2242168709635735, + "learning_rate": 7.895112382822924e-07, + "loss": 0.1224, + "num_tokens": 13423688.0, + "reward": 0.77093505859375, + "reward_std": 0.01482023298740387, + "rewards//mean": 0.77093505859375, + "rewards//std": 0.03478827700018883, + "step": 1553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3108, + "grad_norm": 2.225855827331543, + "kl": 1.0106054935604334, + "learning_rate": 7.892524550559055e-07, + "loss": 0.1011, + "num_tokens": 13432320.0, + "reward": 0.75189208984375, + "reward_std": 0.008050731383264065, + "rewards//mean": 0.75189208984375, + "rewards//std": 0.022638414055109024, + "step": 1554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.311, + "grad_norm": 3.592600107192993, + "kl": 0.8617877624928951, + "learning_rate": 7.889935553186641e-07, + "loss": 0.0862, + "num_tokens": 13440992.0, + "reward": 0.722412109375, + "reward_std": 0.0037979367189109325, + "rewards//mean": 0.722412109375, + "rewards//std": 0.03946883976459503, + "step": 1555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3112, + "grad_norm": 6.422780990600586, + "kl": 2.3515776824206114, + "learning_rate": 7.887345391748532e-07, + "loss": 0.2352, + "num_tokens": 13449704.0, + "reward": 0.7469482421875, + "reward_std": 0.014347494579851627, + "rewards//mean": 0.7469482421875, + "rewards//std": 0.0360405296087265, + "step": 1556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3114, + "grad_norm": 2.1762797832489014, + "kl": 1.141447415575385, + "learning_rate": 7.884754067288046e-07, + "loss": 0.1141, + "num_tokens": 13458200.0, + "reward": 0.74432373046875, + "reward_std": 0.007982091046869755, + "rewards//mean": 0.74432373046875, + "rewards//std": 0.02768205665051937, + "step": 1557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3116, + "grad_norm": 1.4633209705352783, + "kl": 1.4486240819096565, + "learning_rate": 7.882161580848966e-07, + "loss": 0.1449, + "num_tokens": 13466832.0, + "reward": 0.77484130859375, + "reward_std": 0.009828522801399231, + "rewards//mean": 0.77484130859375, + "rewards//std": 0.03169150650501251, + "step": 1558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3118, + "grad_norm": 7.229136943817139, + "kl": 1.8612057957798243, + "learning_rate": 7.879567933475546e-07, + "loss": 0.1861, + "num_tokens": 13475464.0, + "reward": 0.78656005859375, + "reward_std": 0.0042387619614601135, + "rewards//mean": 0.78656005859375, + "rewards//std": 0.026533372700214386, + "step": 1559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.312, + "grad_norm": 1.9912046194076538, + "kl": 1.2802764270454645, + "learning_rate": 7.876973126212506e-07, + "loss": 0.128, + "num_tokens": 13484048.0, + "reward": 0.75384521484375, + "reward_std": 0.008768283762037754, + "rewards//mean": 0.75384521484375, + "rewards//std": 0.028709111735224724, + "step": 1560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3122, + "grad_norm": 2.2037763595581055, + "kl": 2.05273468978703, + "learning_rate": 7.874377160105036e-07, + "loss": 0.2053, + "num_tokens": 13492744.0, + "reward": 0.74334716796875, + "reward_std": 0.014118552207946777, + "rewards//mean": 0.74334716796875, + "rewards//std": 0.035445649176836014, + "step": 1561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3124, + "grad_norm": 2.6772477626800537, + "kl": 1.9671489968895912, + "learning_rate": 7.871780036198788e-07, + "loss": 0.1967, + "num_tokens": 13501400.0, + "reward": 0.71099853515625, + "reward_std": 0.009235531091690063, + "rewards//mean": 0.71099853515625, + "rewards//std": 0.03892269358038902, + "step": 1562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3126, + "grad_norm": 1.6730657815933228, + "kl": 0.8839522656053305, + "learning_rate": 7.869181755539887e-07, + "loss": 0.0884, + "num_tokens": 13509960.0, + "reward": 0.776611328125, + "reward_std": 0.004584170877933502, + "rewards//mean": 0.776611328125, + "rewards//std": 0.021775512024760246, + "step": 1563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3128, + "grad_norm": 1.6954586505889893, + "kl": 1.7431340981274843, + "learning_rate": 7.866582319174917e-07, + "loss": 0.1743, + "num_tokens": 13518672.0, + "reward": 0.75811767578125, + "reward_std": 0.01100950874388218, + "rewards//mean": 0.75811767578125, + "rewards//std": 0.03252829983830452, + "step": 1564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.313, + "grad_norm": 5.556436538696289, + "kl": 2.1147956494241953, + "learning_rate": 7.863981728150931e-07, + "loss": 0.2115, + "num_tokens": 13527360.0, + "reward": 0.75927734375, + "reward_std": 0.01640233024954796, + "rewards//mean": 0.75927734375, + "rewards//std": 0.04094248265028, + "step": 1565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3132, + "grad_norm": 5.08538293838501, + "kl": 2.2543717678636312, + "learning_rate": 7.861379983515448e-07, + "loss": 0.2254, + "num_tokens": 13536168.0, + "reward": 0.74151611328125, + "reward_std": 0.011906067840754986, + "rewards//mean": 0.74151611328125, + "rewards//std": 0.026617106050252914, + "step": 1566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3134, + "grad_norm": 4.156130790710449, + "kl": 1.999925158917904, + "learning_rate": 7.858777086316451e-07, + "loss": 0.2, + "num_tokens": 13544952.0, + "reward": 0.7574462890625, + "reward_std": 0.01224803738296032, + "rewards//mean": 0.7574462890625, + "rewards//std": 0.02960808575153351, + "step": 1567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3136, + "grad_norm": 1.113006591796875, + "kl": 1.2617440987378359, + "learning_rate": 7.856173037602382e-07, + "loss": 0.1262, + "num_tokens": 13553632.0, + "reward": 0.7415771484375, + "reward_std": 0.006727217696607113, + "rewards//mean": 0.7415771484375, + "rewards//std": 0.03345995023846626, + "step": 1568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3138, + "grad_norm": 2.3450334072113037, + "kl": 1.4017070643603802, + "learning_rate": 7.853567838422159e-07, + "loss": 0.1402, + "num_tokens": 13562288.0, + "reward": 0.7579345703125, + "reward_std": 0.006472185719758272, + "rewards//mean": 0.7579345703125, + "rewards//std": 0.025807810947299004, + "step": 1569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.314, + "grad_norm": 3.7492599487304688, + "kl": 1.2129064369946718, + "learning_rate": 7.850961489825149e-07, + "loss": 0.1213, + "num_tokens": 13570992.0, + "reward": 0.71734619140625, + "reward_std": 0.005895301233977079, + "rewards//mean": 0.71734619140625, + "rewards//std": 0.03116748295724392, + "step": 1570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3142, + "grad_norm": 3.8201379776000977, + "kl": 1.356517480686307, + "learning_rate": 7.848353992861194e-07, + "loss": 0.1357, + "num_tokens": 13579696.0, + "reward": 0.7706298828125, + "reward_std": 0.007019908633083105, + "rewards//mean": 0.7706298828125, + "rewards//std": 0.024104561656713486, + "step": 1571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3144, + "grad_norm": 6.372725963592529, + "kl": 1.8210402820259333, + "learning_rate": 7.84574534858059e-07, + "loss": 0.1821, + "num_tokens": 13588248.0, + "reward": 0.78668212890625, + "reward_std": 0.011142443865537643, + "rewards//mean": 0.78668212890625, + "rewards//std": 0.028173675760626793, + "step": 1572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3146, + "grad_norm": 2.6758177280426025, + "kl": 1.2292321268469095, + "learning_rate": 7.8431355580341e-07, + "loss": 0.1229, + "num_tokens": 13596944.0, + "reward": 0.780517578125, + "reward_std": 0.00958950724452734, + "rewards//mean": 0.780517578125, + "rewards//std": 0.02704427018761635, + "step": 1573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3148, + "grad_norm": 2.7407374382019043, + "kl": 1.4006772879511118, + "learning_rate": 7.840524622272948e-07, + "loss": 0.1401, + "num_tokens": 13605600.0, + "reward": 0.7579345703125, + "reward_std": 0.010624115355312824, + "rewards//mean": 0.7579345703125, + "rewards//std": 0.02942139469087124, + "step": 1574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.315, + "grad_norm": 1.3578262329101562, + "kl": 1.2945702727884054, + "learning_rate": 7.837912542348817e-07, + "loss": 0.1295, + "num_tokens": 13614224.0, + "reward": 0.787841796875, + "reward_std": 0.009543722495436668, + "rewards//mean": 0.787841796875, + "rewards//std": 0.02747078612446785, + "step": 1575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3152, + "grad_norm": 1.3978792428970337, + "kl": 0.59761449880898, + "learning_rate": 7.835299319313853e-07, + "loss": 0.0598, + "num_tokens": 13622760.0, + "reward": 0.78985595703125, + "reward_std": 0.0037059392780065536, + "rewards//mean": 0.78985595703125, + "rewards//std": 0.020566320046782494, + "step": 1576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3154, + "grad_norm": 4.199551582336426, + "kl": 1.2743331119418144, + "learning_rate": 7.832684954220663e-07, + "loss": 0.1274, + "num_tokens": 13631560.0, + "reward": 0.7708740234375, + "reward_std": 0.005090552382171154, + "rewards//mean": 0.7708740234375, + "rewards//std": 0.0252983458340168, + "step": 1577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3156, + "grad_norm": 1.912658929824829, + "kl": 0.8259076457470655, + "learning_rate": 7.830069448122312e-07, + "loss": 0.0826, + "num_tokens": 13640152.0, + "reward": 0.77105712890625, + "reward_std": 0.005859190598130226, + "rewards//mean": 0.77105712890625, + "rewards//std": 0.026771916076540947, + "step": 1578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3158, + "grad_norm": 4.865224838256836, + "kl": 1.5082801021635532, + "learning_rate": 7.827452802072327e-07, + "loss": 0.1508, + "num_tokens": 13648872.0, + "reward": 0.7486572265625, + "reward_std": 0.01078061480075121, + "rewards//mean": 0.7486572265625, + "rewards//std": 0.031041739508509636, + "step": 1579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.316, + "grad_norm": 2.9601898193359375, + "kl": 1.4153399653732777, + "learning_rate": 7.82483501712469e-07, + "loss": 0.1415, + "num_tokens": 13657488.0, + "reward": 0.7210693359375, + "reward_std": 0.007203842978924513, + "rewards//mean": 0.7210693359375, + "rewards//std": 0.03267994895577431, + "step": 1580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3162, + "grad_norm": 2.042904853820801, + "kl": 1.1619634237140417, + "learning_rate": 7.822216094333847e-07, + "loss": 0.1162, + "num_tokens": 13666072.0, + "reward": 0.78118896484375, + "reward_std": 0.005377811845391989, + "rewards//mean": 0.78118896484375, + "rewards//std": 0.024463849142193794, + "step": 1581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3164, + "grad_norm": 3.8128445148468018, + "kl": 2.1430839393287897, + "learning_rate": 7.819596034754696e-07, + "loss": 0.2143, + "num_tokens": 13674672.0, + "reward": 0.772216796875, + "reward_std": 0.020793776959180832, + "rewards//mean": 0.772216796875, + "rewards//std": 0.03858760744333267, + "step": 1582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3166, + "grad_norm": 4.484536170959473, + "kl": 1.7213558480143547, + "learning_rate": 7.816974839442603e-07, + "loss": 0.1721, + "num_tokens": 13683352.0, + "reward": 0.75592041015625, + "reward_std": 0.009616399183869362, + "rewards//mean": 0.75592041015625, + "rewards//std": 0.035207830369472504, + "step": 1583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3168, + "grad_norm": 3.307380437850952, + "kl": 0.9095842018723488, + "learning_rate": 7.814352509453379e-07, + "loss": 0.091, + "num_tokens": 13691944.0, + "reward": 0.76568603515625, + "reward_std": 0.006881332024931908, + "rewards//mean": 0.76568603515625, + "rewards//std": 0.03095010668039322, + "step": 1584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.317, + "grad_norm": 0.8180329203605652, + "kl": 0.8523822519928217, + "learning_rate": 7.811729045843301e-07, + "loss": 0.0852, + "num_tokens": 13700544.0, + "reward": 0.752197265625, + "reward_std": 0.005305876489728689, + "rewards//mean": 0.752197265625, + "rewards//std": 0.03477788344025612, + "step": 1585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3172, + "grad_norm": 1.9212833642959595, + "kl": 0.7617413811385632, + "learning_rate": 7.8091044496691e-07, + "loss": 0.0762, + "num_tokens": 13709176.0, + "reward": 0.74871826171875, + "reward_std": 0.004841688089072704, + "rewards//mean": 0.74871826171875, + "rewards//std": 0.03266019746661186, + "step": 1586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3174, + "grad_norm": 13.169044494628906, + "kl": 2.1142956260591745, + "learning_rate": 7.806478721987963e-07, + "loss": 0.2114, + "num_tokens": 13717928.0, + "reward": 0.75714111328125, + "reward_std": 0.007307147607207298, + "rewards//mean": 0.75714111328125, + "rewards//std": 0.03143203258514404, + "step": 1587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3176, + "grad_norm": 2.6408851146698, + "kl": 1.6814204212278128, + "learning_rate": 7.803851863857532e-07, + "loss": 0.1681, + "num_tokens": 13726672.0, + "reward": 0.7545166015625, + "reward_std": 0.014688249677419662, + "rewards//mean": 0.7545166015625, + "rewards//std": 0.03536039963364601, + "step": 1588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3178, + "grad_norm": 4.11833381652832, + "kl": 0.7404515333473682, + "learning_rate": 7.801223876335907e-07, + "loss": 0.074, + "num_tokens": 13735360.0, + "reward": 0.75146484375, + "reward_std": 0.004385187290608883, + "rewards//mean": 0.75146484375, + "rewards//std": 0.022146357223391533, + "step": 1589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.318, + "grad_norm": 3.3639450073242188, + "kl": 1.2758905328810215, + "learning_rate": 7.798594760481637e-07, + "loss": 0.1276, + "num_tokens": 13743944.0, + "reward": 0.744384765625, + "reward_std": 0.01257241703569889, + "rewards//mean": 0.744384765625, + "rewards//std": 0.03544698655605316, + "step": 1590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3182, + "grad_norm": 2.480842351913452, + "kl": 0.7207079511135817, + "learning_rate": 7.795964517353733e-07, + "loss": 0.0721, + "num_tokens": 13752664.0, + "reward": 0.75469970703125, + "reward_std": 0.005162329412996769, + "rewards//mean": 0.75469970703125, + "rewards//std": 0.02229546196758747, + "step": 1591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3184, + "grad_norm": 2.6617345809936523, + "kl": 0.6747199799865484, + "learning_rate": 7.793333148011657e-07, + "loss": 0.0675, + "num_tokens": 13761248.0, + "reward": 0.75189208984375, + "reward_std": 0.003001078264787793, + "rewards//mean": 0.75189208984375, + "rewards//std": 0.03142769634723663, + "step": 1592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3186, + "grad_norm": 3.590163230895996, + "kl": 1.6595496125519276, + "learning_rate": 7.790700653515323e-07, + "loss": 0.166, + "num_tokens": 13769792.0, + "reward": 0.70941162109375, + "reward_std": 0.018709056079387665, + "rewards//mean": 0.70941162109375, + "rewards//std": 0.047458428889513016, + "step": 1593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3188, + "grad_norm": 1.8195044994354248, + "kl": 0.7813002169132233, + "learning_rate": 7.788067034925099e-07, + "loss": 0.0781, + "num_tokens": 13778368.0, + "reward": 0.76263427734375, + "reward_std": 0.004283786751329899, + "rewards//mean": 0.76263427734375, + "rewards//std": 0.024956386536359787, + "step": 1594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.319, + "grad_norm": 1.3556106090545654, + "kl": 0.5726229008287191, + "learning_rate": 7.785432293301806e-07, + "loss": 0.0573, + "num_tokens": 13786984.0, + "reward": 0.7423095703125, + "reward_std": 0.0024699352215975523, + "rewards//mean": 0.7423095703125, + "rewards//std": 0.029601948335766792, + "step": 1595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3192, + "grad_norm": 2.3305325508117676, + "kl": 1.0940890200436115, + "learning_rate": 7.78279642970672e-07, + "loss": 0.1094, + "num_tokens": 13795544.0, + "reward": 0.766357421875, + "reward_std": 0.007712875958532095, + "rewards//mean": 0.766357421875, + "rewards//std": 0.028321649879217148, + "step": 1596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3194, + "grad_norm": 2.3872010707855225, + "kl": 1.9481038190424442, + "learning_rate": 7.780159445201562e-07, + "loss": 0.1948, + "num_tokens": 13804232.0, + "reward": 0.7818603515625, + "reward_std": 0.013373659923672676, + "rewards//mean": 0.7818603515625, + "rewards//std": 0.029406985267996788, + "step": 1597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3196, + "grad_norm": 5.151343822479248, + "kl": 1.7479426227509975, + "learning_rate": 7.777521340848514e-07, + "loss": 0.1748, + "num_tokens": 13812880.0, + "reward": 0.70086669921875, + "reward_std": 0.010062005370855331, + "rewards//mean": 0.70086669921875, + "rewards//std": 0.03129495680332184, + "step": 1598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3198, + "grad_norm": 5.408864498138428, + "kl": 1.326032130047679, + "learning_rate": 7.774882117710202e-07, + "loss": 0.1326, + "num_tokens": 13821464.0, + "reward": 0.74822998046875, + "reward_std": 0.00762249156832695, + "rewards//mean": 0.74822998046875, + "rewards//std": 0.029031576588749886, + "step": 1599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.32, + "grad_norm": 8.4829740524292, + "kl": 1.9540950190275908, + "learning_rate": 7.772241776849704e-07, + "loss": 0.1954, + "num_tokens": 13830008.0, + "reward": 0.69781494140625, + "reward_std": 0.0062002502381801605, + "rewards//mean": 0.69781494140625, + "rewards//std": 0.01987382210791111, + "step": 1600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3202, + "grad_norm": 4.335939884185791, + "kl": 1.7251414489001036, + "learning_rate": 7.769600319330552e-07, + "loss": 0.1725, + "num_tokens": 13838680.0, + "reward": 0.772216796875, + "reward_std": 0.013628230430185795, + "rewards//mean": 0.772216796875, + "rewards//std": 0.034840505570173264, + "step": 1601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3204, + "grad_norm": 3.052686929702759, + "kl": 1.5815194714814425, + "learning_rate": 7.76695774621672e-07, + "loss": 0.1582, + "num_tokens": 13847352.0, + "reward": 0.751220703125, + "reward_std": 0.00982726737856865, + "rewards//mean": 0.751220703125, + "rewards//std": 0.02717827446758747, + "step": 1602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3206, + "grad_norm": 5.098186492919922, + "kl": 1.0441518649458885, + "learning_rate": 7.764314058572639e-07, + "loss": 0.1044, + "num_tokens": 13855992.0, + "reward": 0.75897216796875, + "reward_std": 0.006328641436994076, + "rewards//mean": 0.75897216796875, + "rewards//std": 0.022549305111169815, + "step": 1603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3208, + "grad_norm": 2.5886666774749756, + "kl": 1.1139055788516998, + "learning_rate": 7.761669257463187e-07, + "loss": 0.1114, + "num_tokens": 13864784.0, + "reward": 0.77484130859375, + "reward_std": 0.007363666780292988, + "rewards//mean": 0.77484130859375, + "rewards//std": 0.016810311004519463, + "step": 1604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.321, + "grad_norm": 2.864546298980713, + "kl": 1.4520734641700983, + "learning_rate": 7.759023343953688e-07, + "loss": 0.1452, + "num_tokens": 13873328.0, + "reward": 0.7550048828125, + "reward_std": 0.00650212075561285, + "rewards//mean": 0.7550048828125, + "rewards//std": 0.02677057310938835, + "step": 1605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3212, + "grad_norm": 5.175642490386963, + "kl": 1.1859997715801, + "learning_rate": 7.756376319109916e-07, + "loss": 0.1186, + "num_tokens": 13882000.0, + "reward": 0.744384765625, + "reward_std": 0.009795076213777065, + "rewards//mean": 0.744384765625, + "rewards//std": 0.02811565436422825, + "step": 1606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3214, + "grad_norm": 2.274979591369629, + "kl": 1.1782709881663322, + "learning_rate": 7.753728183998092e-07, + "loss": 0.1178, + "num_tokens": 13890640.0, + "reward": 0.752685546875, + "reward_std": 0.009832650423049927, + "rewards//mean": 0.752685546875, + "rewards//std": 0.03325415030121803, + "step": 1607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3216, + "grad_norm": 6.960323810577393, + "kl": 2.113839268684387, + "learning_rate": 7.751078939684885e-07, + "loss": 0.2114, + "num_tokens": 13899208.0, + "reward": 0.75244140625, + "reward_std": 0.01238178089261055, + "rewards//mean": 0.75244140625, + "rewards//std": 0.03176885098218918, + "step": 1608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3218, + "grad_norm": 3.728010892868042, + "kl": 0.8760987985879183, + "learning_rate": 7.748428587237411e-07, + "loss": 0.0876, + "num_tokens": 13907808.0, + "reward": 0.75177001953125, + "reward_std": 0.00822415016591549, + "rewards//mean": 0.75177001953125, + "rewards//std": 0.02808164805173874, + "step": 1609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.322, + "grad_norm": 3.4108834266662598, + "kl": 1.701963922008872, + "learning_rate": 7.74577712772323e-07, + "loss": 0.1702, + "num_tokens": 13916416.0, + "reward": 0.75103759765625, + "reward_std": 0.00944104976952076, + "rewards//mean": 0.75103759765625, + "rewards//std": 0.017023282125592232, + "step": 1610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3222, + "grad_norm": 1.2875810861587524, + "kl": 1.2748398445546627, + "learning_rate": 7.743124562210351e-07, + "loss": 0.1275, + "num_tokens": 13925032.0, + "reward": 0.71661376953125, + "reward_std": 0.0060372017323970795, + "rewards//mean": 0.71661376953125, + "rewards//std": 0.025228464975953102, + "step": 1611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3224, + "grad_norm": 5.368699550628662, + "kl": 1.4100149907171726, + "learning_rate": 7.740470891767224e-07, + "loss": 0.141, + "num_tokens": 13933616.0, + "reward": 0.74786376953125, + "reward_std": 0.00679908599704504, + "rewards//mean": 0.74786376953125, + "rewards//std": 0.02963433973491192, + "step": 1612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3226, + "grad_norm": 4.926486492156982, + "kl": 1.964484740048647, + "learning_rate": 7.737816117462751e-07, + "loss": 0.1964, + "num_tokens": 13942152.0, + "reward": 0.74755859375, + "reward_std": 0.006185316480696201, + "rewards//mean": 0.74755859375, + "rewards//std": 0.028726045042276382, + "step": 1613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3228, + "grad_norm": 2.5669498443603516, + "kl": 1.7307023089379072, + "learning_rate": 7.735160240366274e-07, + "loss": 0.1731, + "num_tokens": 13950896.0, + "reward": 0.7427978515625, + "reward_std": 0.009273719973862171, + "rewards//mean": 0.7427978515625, + "rewards//std": 0.03390577435493469, + "step": 1614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.323, + "grad_norm": 6.909610748291016, + "kl": 2.189407590776682, + "learning_rate": 7.732503261547578e-07, + "loss": 0.2189, + "num_tokens": 13959576.0, + "reward": 0.783935546875, + "reward_std": 0.011407473124563694, + "rewards//mean": 0.783935546875, + "rewards//std": 0.031062457710504532, + "step": 1615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3232, + "grad_norm": 2.96000337600708, + "kl": 2.104118162766099, + "learning_rate": 7.729845182076895e-07, + "loss": 0.2104, + "num_tokens": 13968200.0, + "reward": 0.78009033203125, + "reward_std": 0.01756000518798828, + "rewards//mean": 0.78009033203125, + "rewards//std": 0.03320170193910599, + "step": 1616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3234, + "grad_norm": 10.360834121704102, + "kl": 1.9920466039329767, + "learning_rate": 7.7271860030249e-07, + "loss": 0.1992, + "num_tokens": 13976824.0, + "reward": 0.7193603515625, + "reward_std": 0.011293711140751839, + "rewards//mean": 0.7193603515625, + "rewards//std": 0.03919384628534317, + "step": 1617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3236, + "grad_norm": 8.590371131896973, + "kl": 2.300953108817339, + "learning_rate": 7.72452572546271e-07, + "loss": 0.2301, + "num_tokens": 13985536.0, + "reward": 0.756103515625, + "reward_std": 0.010012295097112656, + "rewards//mean": 0.756103515625, + "rewards//std": 0.037986643612384796, + "step": 1618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3238, + "grad_norm": 10.4971923828125, + "kl": 3.008170459419489, + "learning_rate": 7.721864350461882e-07, + "loss": 0.3008, + "num_tokens": 13994200.0, + "reward": 0.77392578125, + "reward_std": 0.01783689856529236, + "rewards//mean": 0.77392578125, + "rewards//std": 0.04572148621082306, + "step": 1619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.324, + "grad_norm": 5.43795919418335, + "kl": 1.2357711791992188, + "learning_rate": 7.71920187909442e-07, + "loss": 0.1236, + "num_tokens": 14002800.0, + "reward": 0.76153564453125, + "reward_std": 0.010449150577187538, + "rewards//mean": 0.76153564453125, + "rewards//std": 0.03423428535461426, + "step": 1620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3242, + "grad_norm": 4.845322132110596, + "kl": 1.9339684657752514, + "learning_rate": 7.716538312432765e-07, + "loss": 0.1934, + "num_tokens": 14011320.0, + "reward": 0.735595703125, + "reward_std": 0.014104663394391537, + "rewards//mean": 0.735595703125, + "rewards//std": 0.03157288581132889, + "step": 1621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3244, + "grad_norm": 3.489070415496826, + "kl": 1.914293834939599, + "learning_rate": 7.713873651549804e-07, + "loss": 0.1914, + "num_tokens": 14019920.0, + "reward": 0.746337890625, + "reward_std": 0.013487953692674637, + "rewards//mean": 0.746337890625, + "rewards//std": 0.03299090638756752, + "step": 1622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3246, + "grad_norm": 4.1307878494262695, + "kl": 1.3855783771723509, + "learning_rate": 7.71120789751886e-07, + "loss": 0.1386, + "num_tokens": 14028488.0, + "reward": 0.7540283203125, + "reward_std": 0.012606613337993622, + "rewards//mean": 0.7540283203125, + "rewards//std": 0.031141061335802078, + "step": 1623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3248, + "grad_norm": 2.5597338676452637, + "kl": 1.7195310927927494, + "learning_rate": 7.7085410514137e-07, + "loss": 0.172, + "num_tokens": 14037240.0, + "reward": 0.74755859375, + "reward_std": 0.013452245853841305, + "rewards//mean": 0.74755859375, + "rewards//std": 0.0396929532289505, + "step": 1624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.325, + "grad_norm": 2.0841336250305176, + "kl": 1.7714228797703981, + "learning_rate": 7.705873114308527e-07, + "loss": 0.1771, + "num_tokens": 14045912.0, + "reward": 0.75604248046875, + "reward_std": 0.012261072173714638, + "rewards//mean": 0.75604248046875, + "rewards//std": 0.03111741691827774, + "step": 1625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3252, + "grad_norm": 3.2604620456695557, + "kl": 1.9581709802150726, + "learning_rate": 7.703204087277988e-07, + "loss": 0.1958, + "num_tokens": 14054608.0, + "reward": 0.75244140625, + "reward_std": 0.011977639980614185, + "rewards//mean": 0.75244140625, + "rewards//std": 0.026009095832705498, + "step": 1626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3254, + "grad_norm": 4.069620609283447, + "kl": 1.4125154428184032, + "learning_rate": 7.700533971397165e-07, + "loss": 0.1413, + "num_tokens": 14063248.0, + "reward": 0.7188720703125, + "reward_std": 0.009850779548287392, + "rewards//mean": 0.7188720703125, + "rewards//std": 0.03520766645669937, + "step": 1627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3256, + "grad_norm": 13.517831802368164, + "kl": 1.311868930235505, + "learning_rate": 7.697862767741583e-07, + "loss": 0.1312, + "num_tokens": 14071864.0, + "reward": 0.7684326171875, + "reward_std": 0.01460731215775013, + "rewards//mean": 0.7684326171875, + "rewards//std": 0.034262850880622864, + "step": 1628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3258, + "grad_norm": 2.5224111080169678, + "kl": 1.492361443117261, + "learning_rate": 7.695190477387199e-07, + "loss": 0.1492, + "num_tokens": 14080496.0, + "reward": 0.75189208984375, + "reward_std": 0.01079997792840004, + "rewards//mean": 0.75189208984375, + "rewards//std": 0.029846590012311935, + "step": 1629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.326, + "grad_norm": 2.7099499702453613, + "kl": 1.025027807801962, + "learning_rate": 7.692517101410414e-07, + "loss": 0.1025, + "num_tokens": 14089224.0, + "reward": 0.77435302734375, + "reward_std": 0.00937599316239357, + "rewards//mean": 0.77435302734375, + "rewards//std": 0.026178419589996338, + "step": 1630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3262, + "grad_norm": 6.866759300231934, + "kl": 1.465584084391594, + "learning_rate": 7.689842640888063e-07, + "loss": 0.1466, + "num_tokens": 14097808.0, + "reward": 0.75164794921875, + "reward_std": 0.0056014652363955975, + "rewards//mean": 0.75164794921875, + "rewards//std": 0.02820965275168419, + "step": 1631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3264, + "grad_norm": 2.365687847137451, + "kl": 1.803080828860402, + "learning_rate": 7.687167096897418e-07, + "loss": 0.1803, + "num_tokens": 14106552.0, + "reward": 0.7303466796875, + "reward_std": 0.014007436111569405, + "rewards//mean": 0.7303466796875, + "rewards//std": 0.029338955879211426, + "step": 1632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3266, + "grad_norm": 2.817216157913208, + "kl": 1.9862238634377718, + "learning_rate": 7.684490470516185e-07, + "loss": 0.1986, + "num_tokens": 14115120.0, + "reward": 0.76763916015625, + "reward_std": 0.01972937025129795, + "rewards//mean": 0.76763916015625, + "rewards//std": 0.033414848148822784, + "step": 1633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3268, + "grad_norm": 5.252283096313477, + "kl": 2.063719341531396, + "learning_rate": 7.681812762822515e-07, + "loss": 0.2064, + "num_tokens": 14123784.0, + "reward": 0.7532958984375, + "reward_std": 0.011641591787338257, + "rewards//mean": 0.7532958984375, + "rewards//std": 0.03930491954088211, + "step": 1634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.327, + "grad_norm": 3.38424015045166, + "kl": 0.7838151380419731, + "learning_rate": 7.679133974894982e-07, + "loss": 0.0784, + "num_tokens": 14132368.0, + "reward": 0.75506591796875, + "reward_std": 0.0034234439954161644, + "rewards//mean": 0.75506591796875, + "rewards//std": 0.031208738684654236, + "step": 1635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3272, + "grad_norm": 4.37358283996582, + "kl": 1.3049269057810307, + "learning_rate": 7.676454107812607e-07, + "loss": 0.1305, + "num_tokens": 14140984.0, + "reward": 0.78289794921875, + "reward_std": 0.010955949313938618, + "rewards//mean": 0.78289794921875, + "rewards//std": 0.032591529190540314, + "step": 1636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3274, + "grad_norm": 10.67484188079834, + "kl": 2.8740222696214914, + "learning_rate": 7.673773162654836e-07, + "loss": 0.2874, + "num_tokens": 14149696.0, + "reward": 0.7432861328125, + "reward_std": 0.01724255643785, + "rewards//mean": 0.7432861328125, + "rewards//std": 0.039114974439144135, + "step": 1637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3276, + "grad_norm": 3.0229294300079346, + "kl": 1.480850925669074, + "learning_rate": 7.671091140501555e-07, + "loss": 0.1481, + "num_tokens": 14158304.0, + "reward": 0.7454833984375, + "reward_std": 0.008684443309903145, + "rewards//mean": 0.7454833984375, + "rewards//std": 0.023253023624420166, + "step": 1638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3278, + "grad_norm": 7.474429130554199, + "kl": 1.0412239786237478, + "learning_rate": 7.668408042433081e-07, + "loss": 0.1041, + "num_tokens": 14166912.0, + "reward": 0.75970458984375, + "reward_std": 0.013540440239012241, + "rewards//mean": 0.75970458984375, + "rewards//std": 0.03504233807325363, + "step": 1639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.328, + "grad_norm": 4.839244365692139, + "kl": 1.6017267871648073, + "learning_rate": 7.665723869530169e-07, + "loss": 0.1602, + "num_tokens": 14175480.0, + "reward": 0.77215576171875, + "reward_std": 0.01985972747206688, + "rewards//mean": 0.77215576171875, + "rewards//std": 0.03297019377350807, + "step": 1640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3282, + "grad_norm": 2.8047549724578857, + "kl": 0.9350009337067604, + "learning_rate": 7.663038622873999e-07, + "loss": 0.0935, + "num_tokens": 14184088.0, + "reward": 0.74517822265625, + "reward_std": 0.007546941749751568, + "rewards//mean": 0.74517822265625, + "rewards//std": 0.03274675831198692, + "step": 1641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3284, + "grad_norm": 3.22479510307312, + "kl": 1.7214165013283491, + "learning_rate": 7.660352303546192e-07, + "loss": 0.1721, + "num_tokens": 14192728.0, + "reward": 0.71905517578125, + "reward_std": 0.016738075762987137, + "rewards//mean": 0.71905517578125, + "rewards//std": 0.04340509697794914, + "step": 1642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3286, + "grad_norm": 9.772603034973145, + "kl": 1.9997139070183039, + "learning_rate": 7.657664912628794e-07, + "loss": 0.2, + "num_tokens": 14201512.0, + "reward": 0.7371826171875, + "reward_std": 0.007628243882209063, + "rewards//mean": 0.7371826171875, + "rewards//std": 0.03613448888063431, + "step": 1643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3288, + "grad_norm": 2.8274033069610596, + "kl": 1.032218774780631, + "learning_rate": 7.654976451204287e-07, + "loss": 0.1032, + "num_tokens": 14210072.0, + "reward": 0.733642578125, + "reward_std": 0.0062283845618367195, + "rewards//mean": 0.733642578125, + "rewards//std": 0.032732944935560226, + "step": 1644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.329, + "grad_norm": 4.682992935180664, + "kl": 1.2679704055190086, + "learning_rate": 7.652286920355583e-07, + "loss": 0.1268, + "num_tokens": 14218784.0, + "reward": 0.7537841796875, + "reward_std": 0.012609190307557583, + "rewards//mean": 0.7537841796875, + "rewards//std": 0.034694869071245193, + "step": 1645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3292, + "grad_norm": 6.466736793518066, + "kl": 1.5422461535781622, + "learning_rate": 7.649596321166024e-07, + "loss": 0.1542, + "num_tokens": 14227544.0, + "reward": 0.76214599609375, + "reward_std": 0.007505156099796295, + "rewards//mean": 0.76214599609375, + "rewards//std": 0.030290622264146805, + "step": 1646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3294, + "grad_norm": 6.936708927154541, + "kl": 2.0178975593298674, + "learning_rate": 7.646904654719385e-07, + "loss": 0.2018, + "num_tokens": 14236200.0, + "reward": 0.75567626953125, + "reward_std": 0.013023676350712776, + "rewards//mean": 0.75567626953125, + "rewards//std": 0.03307928517460823, + "step": 1647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3296, + "grad_norm": 3.4528238773345947, + "kl": 1.3103303592652082, + "learning_rate": 7.644211922099867e-07, + "loss": 0.131, + "num_tokens": 14244816.0, + "reward": 0.74505615234375, + "reward_std": 0.009770406410098076, + "rewards//mean": 0.74505615234375, + "rewards//std": 0.024678243324160576, + "step": 1648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3298, + "grad_norm": 3.8525946140289307, + "kl": 0.5916859656572342, + "learning_rate": 7.641518124392103e-07, + "loss": 0.0592, + "num_tokens": 14253456.0, + "reward": 0.7496337890625, + "reward_std": 0.0044884709641337395, + "rewards//mean": 0.7496337890625, + "rewards//std": 0.022474413737654686, + "step": 1649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.33, + "grad_norm": 3.8584213256835938, + "kl": 1.8569832909852266, + "learning_rate": 7.638823262681154e-07, + "loss": 0.1857, + "num_tokens": 14262040.0, + "reward": 0.77972412109375, + "reward_std": 0.01577387936413288, + "rewards//mean": 0.77972412109375, + "rewards//std": 0.038378290832042694, + "step": 1650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3302, + "grad_norm": 4.530097961425781, + "kl": 0.9330780766904354, + "learning_rate": 7.636127338052511e-07, + "loss": 0.0933, + "num_tokens": 14270656.0, + "reward": 0.75543212890625, + "reward_std": 0.007381060626357794, + "rewards//mean": 0.75543212890625, + "rewards//std": 0.030150368809700012, + "step": 1651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3304, + "grad_norm": 4.290153503417969, + "kl": 1.5832807626575232, + "learning_rate": 7.633430351592093e-07, + "loss": 0.1583, + "num_tokens": 14279280.0, + "reward": 0.75518798828125, + "reward_std": 0.013890420086681843, + "rewards//mean": 0.75518798828125, + "rewards//std": 0.03930390998721123, + "step": 1652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3306, + "grad_norm": 4.185068607330322, + "kl": 1.0172320175915956, + "learning_rate": 7.630732304386243e-07, + "loss": 0.1017, + "num_tokens": 14287896.0, + "reward": 0.77313232421875, + "reward_std": 0.012001155875623226, + "rewards//mean": 0.77313232421875, + "rewards//std": 0.029530974105000496, + "step": 1653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3308, + "grad_norm": 2.844931125640869, + "kl": 2.5025689974427223, + "learning_rate": 7.628033197521735e-07, + "loss": 0.2503, + "num_tokens": 14296560.0, + "reward": 0.7325439453125, + "reward_std": 0.010810887441039085, + "rewards//mean": 0.7325439453125, + "rewards//std": 0.025477223098278046, + "step": 1654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.331, + "grad_norm": 3.049485921859741, + "kl": 1.1164589691907167, + "learning_rate": 7.625333032085769e-07, + "loss": 0.1116, + "num_tokens": 14305192.0, + "reward": 0.73834228515625, + "reward_std": 0.005308812949806452, + "rewards//mean": 0.73834228515625, + "rewards//std": 0.02388272061944008, + "step": 1655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3312, + "grad_norm": 3.390105962753296, + "kl": 2.4483199659734964, + "learning_rate": 7.622631809165972e-07, + "loss": 0.2448, + "num_tokens": 14313848.0, + "reward": 0.75872802734375, + "reward_std": 0.0175870880484581, + "rewards//mean": 0.75872802734375, + "rewards//std": 0.035954900085926056, + "step": 1656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3314, + "grad_norm": 3.059648275375366, + "kl": 0.9026069082319736, + "learning_rate": 7.619929529850396e-07, + "loss": 0.0903, + "num_tokens": 14322432.0, + "reward": 0.769287109375, + "reward_std": 0.010657286271452904, + "rewards//mean": 0.769287109375, + "rewards//std": 0.030843360349535942, + "step": 1657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3316, + "grad_norm": 1.5156716108322144, + "kl": 1.1438779421150684, + "learning_rate": 7.617226195227517e-07, + "loss": 0.1144, + "num_tokens": 14331064.0, + "reward": 0.77337646484375, + "reward_std": 0.008215641602873802, + "rewards//mean": 0.77337646484375, + "rewards//std": 0.02871754765510559, + "step": 1658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3318, + "grad_norm": 4.749110698699951, + "kl": 1.5000360701233149, + "learning_rate": 7.614521806386243e-07, + "loss": 0.15, + "num_tokens": 14339672.0, + "reward": 0.75054931640625, + "reward_std": 0.010491067543625832, + "rewards//mean": 0.75054931640625, + "rewards//std": 0.0165416169911623, + "step": 1659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.332, + "grad_norm": 4.983676433563232, + "kl": 1.2771404217928648, + "learning_rate": 7.611816364415895e-07, + "loss": 0.1277, + "num_tokens": 14348360.0, + "reward": 0.74749755859375, + "reward_std": 0.011189857497811317, + "rewards//mean": 0.74749755859375, + "rewards//std": 0.04224032908678055, + "step": 1660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3322, + "grad_norm": 3.840453863143921, + "kl": 0.8378145918250084, + "learning_rate": 7.60910987040623e-07, + "loss": 0.0838, + "num_tokens": 14356952.0, + "reward": 0.7608642578125, + "reward_std": 0.006858887150883675, + "rewards//mean": 0.7608642578125, + "rewards//std": 0.025079594925045967, + "step": 1661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3324, + "grad_norm": 11.438995361328125, + "kl": 2.391136320307851, + "learning_rate": 7.606402325447419e-07, + "loss": 0.2391, + "num_tokens": 14365712.0, + "reward": 0.7235107421875, + "reward_std": 0.012360994704067707, + "rewards//mean": 0.7235107421875, + "rewards//std": 0.036567576229572296, + "step": 1662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3326, + "grad_norm": 4.214688301086426, + "kl": 1.569329358637333, + "learning_rate": 7.603693730630066e-07, + "loss": 0.1569, + "num_tokens": 14374352.0, + "reward": 0.7423095703125, + "reward_std": 0.008905535563826561, + "rewards//mean": 0.7423095703125, + "rewards//std": 0.03470184653997421, + "step": 1663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3328, + "grad_norm": 3.899890899658203, + "kl": 1.8594390582293272, + "learning_rate": 7.600984087045186e-07, + "loss": 0.1859, + "num_tokens": 14383008.0, + "reward": 0.74951171875, + "reward_std": 0.009215177968144417, + "rewards//mean": 0.74951171875, + "rewards//std": 0.03464968129992485, + "step": 1664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.333, + "grad_norm": 1.095352053642273, + "kl": 0.7407671269029379, + "learning_rate": 7.598273395784229e-07, + "loss": 0.0741, + "num_tokens": 14391616.0, + "reward": 0.79248046875, + "reward_std": 0.0021278527565300465, + "rewards//mean": 0.79248046875, + "rewards//std": 0.02507023885846138, + "step": 1665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3332, + "grad_norm": 4.591573715209961, + "kl": 1.305159417912364, + "learning_rate": 7.59556165793906e-07, + "loss": 0.1305, + "num_tokens": 14400200.0, + "reward": 0.72979736328125, + "reward_std": 0.0060786036774516106, + "rewards//mean": 0.72979736328125, + "rewards//std": 0.03513595461845398, + "step": 1666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3334, + "grad_norm": 3.2072863578796387, + "kl": 0.8619169108569622, + "learning_rate": 7.592848874601963e-07, + "loss": 0.0862, + "num_tokens": 14408840.0, + "reward": 0.76300048828125, + "reward_std": 0.00585097074508667, + "rewards//mean": 0.76300048828125, + "rewards//std": 0.030481424182653427, + "step": 1667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3336, + "grad_norm": 3.877617835998535, + "kl": 1.613358285278082, + "learning_rate": 7.590135046865651e-07, + "loss": 0.1613, + "num_tokens": 14417464.0, + "reward": 0.77740478515625, + "reward_std": 0.01245960034430027, + "rewards//mean": 0.77740478515625, + "rewards//std": 0.03658759966492653, + "step": 1668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3338, + "grad_norm": 2.6459970474243164, + "kl": 1.7920551113784313, + "learning_rate": 7.587420175823252e-07, + "loss": 0.1792, + "num_tokens": 14426064.0, + "reward": 0.743408203125, + "reward_std": 0.011848630383610725, + "rewards//mean": 0.743408203125, + "rewards//std": 0.03310084342956543, + "step": 1669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.334, + "grad_norm": 5.083470344543457, + "kl": 1.1938681211322546, + "learning_rate": 7.584704262568314e-07, + "loss": 0.1194, + "num_tokens": 14434632.0, + "reward": 0.754638671875, + "reward_std": 0.008189452812075615, + "rewards//mean": 0.754638671875, + "rewards//std": 0.030176525935530663, + "step": 1670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3342, + "grad_norm": 1.880113124847412, + "kl": 0.9313200451433659, + "learning_rate": 7.581987308194809e-07, + "loss": 0.0931, + "num_tokens": 14443280.0, + "reward": 0.75433349609375, + "reward_std": 0.0029908069409430027, + "rewards//mean": 0.75433349609375, + "rewards//std": 0.030860962346196175, + "step": 1671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3344, + "grad_norm": 3.1007273197174072, + "kl": 1.3674478232860565, + "learning_rate": 7.579269313797125e-07, + "loss": 0.1367, + "num_tokens": 14451808.0, + "reward": 0.72784423828125, + "reward_std": 0.008877030573785305, + "rewards//mean": 0.72784423828125, + "rewards//std": 0.027081677690148354, + "step": 1672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3346, + "grad_norm": 8.102618217468262, + "kl": 1.8559861723333597, + "learning_rate": 7.576550280470071e-07, + "loss": 0.1856, + "num_tokens": 14460544.0, + "reward": 0.76214599609375, + "reward_std": 0.014675160869956017, + "rewards//mean": 0.76214599609375, + "rewards//std": 0.03505011275410652, + "step": 1673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3348, + "grad_norm": 2.681720733642578, + "kl": 1.2322614286094904, + "learning_rate": 7.573830209308872e-07, + "loss": 0.1232, + "num_tokens": 14469176.0, + "reward": 0.73406982421875, + "reward_std": 0.004265114665031433, + "rewards//mean": 0.73406982421875, + "rewards//std": 0.032371558248996735, + "step": 1674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.335, + "grad_norm": 8.004125595092773, + "kl": 1.0956176165491343, + "learning_rate": 7.57110910140917e-07, + "loss": 0.1096, + "num_tokens": 14477800.0, + "reward": 0.7530517578125, + "reward_std": 0.011142227798700333, + "rewards//mean": 0.7530517578125, + "rewards//std": 0.026461174711585045, + "step": 1675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3352, + "grad_norm": 5.298357963562012, + "kl": 1.3848022278398275, + "learning_rate": 7.568386957867032e-07, + "loss": 0.1385, + "num_tokens": 14486440.0, + "reward": 0.76104736328125, + "reward_std": 0.006294413469731808, + "rewards//mean": 0.76104736328125, + "rewards//std": 0.0307936891913414, + "step": 1676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3354, + "grad_norm": 5.516765117645264, + "kl": 0.8469223249703646, + "learning_rate": 7.565663779778933e-07, + "loss": 0.0847, + "num_tokens": 14495040.0, + "reward": 0.73773193359375, + "reward_std": 0.004481610842049122, + "rewards//mean": 0.73773193359375, + "rewards//std": 0.021596727892756462, + "step": 1677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3356, + "grad_norm": 4.592514991760254, + "kl": 1.4531228300184011, + "learning_rate": 7.562939568241771e-07, + "loss": 0.1453, + "num_tokens": 14503640.0, + "reward": 0.75836181640625, + "reward_std": 0.010041479021310806, + "rewards//mean": 0.75836181640625, + "rewards//std": 0.018634773790836334, + "step": 1678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3358, + "grad_norm": 8.456008911132812, + "kl": 1.7594390772283077, + "learning_rate": 7.560214324352858e-07, + "loss": 0.1759, + "num_tokens": 14512272.0, + "reward": 0.73541259765625, + "reward_std": 0.005506291054189205, + "rewards//mean": 0.73541259765625, + "rewards//std": 0.028307681903243065, + "step": 1679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.336, + "grad_norm": 4.635588645935059, + "kl": 1.3168210051953793, + "learning_rate": 7.55748804920992e-07, + "loss": 0.1317, + "num_tokens": 14520976.0, + "reward": 0.75030517578125, + "reward_std": 0.010580218397080898, + "rewards//mean": 0.75030517578125, + "rewards//std": 0.03075236827135086, + "step": 1680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3362, + "grad_norm": 1.6931532621383667, + "kl": 1.3724579811096191, + "learning_rate": 7.554760743911103e-07, + "loss": 0.1372, + "num_tokens": 14529696.0, + "reward": 0.737548828125, + "reward_std": 0.007787193171679974, + "rewards//mean": 0.737548828125, + "rewards//std": 0.03272554278373718, + "step": 1681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3364, + "grad_norm": 1.35236394405365, + "kl": 0.5628429744392633, + "learning_rate": 7.552032409554962e-07, + "loss": 0.0563, + "num_tokens": 14538296.0, + "reward": 0.770751953125, + "reward_std": 0.002668093889951706, + "rewards//mean": 0.770751953125, + "rewards//std": 0.017453676089644432, + "step": 1682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3366, + "grad_norm": 2.2473065853118896, + "kl": 1.3161607328802347, + "learning_rate": 7.549303047240474e-07, + "loss": 0.1316, + "num_tokens": 14546904.0, + "reward": 0.78643798828125, + "reward_std": 0.00971379317343235, + "rewards//mean": 0.78643798828125, + "rewards//std": 0.02911592461168766, + "step": 1683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3368, + "grad_norm": 2.310183525085449, + "kl": 0.814789243042469, + "learning_rate": 7.54657265806702e-07, + "loss": 0.0815, + "num_tokens": 14555512.0, + "reward": 0.77813720703125, + "reward_std": 0.007041408680379391, + "rewards//mean": 0.77813720703125, + "rewards//std": 0.029249753803014755, + "step": 1684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.337, + "grad_norm": 9.518553733825684, + "kl": 1.6211869530379772, + "learning_rate": 7.543841243134408e-07, + "loss": 0.1621, + "num_tokens": 14564144.0, + "reward": 0.75732421875, + "reward_std": 0.010654858313500881, + "rewards//mean": 0.75732421875, + "rewards//std": 0.03877311199903488, + "step": 1685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3372, + "grad_norm": 4.3729329109191895, + "kl": 1.1276446674019098, + "learning_rate": 7.541108803542845e-07, + "loss": 0.1128, + "num_tokens": 14572752.0, + "reward": 0.760498046875, + "reward_std": 0.010988160036504269, + "rewards//mean": 0.760498046875, + "rewards//std": 0.035861365497112274, + "step": 1686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3374, + "grad_norm": 4.076907157897949, + "kl": 1.7543259430676699, + "learning_rate": 7.538375340392961e-07, + "loss": 0.1754, + "num_tokens": 14581416.0, + "reward": 0.71630859375, + "reward_std": 0.01102924533188343, + "rewards//mean": 0.71630859375, + "rewards//std": 0.037875697016716, + "step": 1687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3376, + "grad_norm": 5.360969066619873, + "kl": 1.605041479691863, + "learning_rate": 7.535640854785791e-07, + "loss": 0.1605, + "num_tokens": 14590024.0, + "reward": 0.7705078125, + "reward_std": 0.01736287772655487, + "rewards//mean": 0.7705078125, + "rewards//std": 0.03298448026180267, + "step": 1688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3378, + "grad_norm": 3.594897508621216, + "kl": 0.8084434028714895, + "learning_rate": 7.532905347822791e-07, + "loss": 0.0808, + "num_tokens": 14598648.0, + "reward": 0.7225341796875, + "reward_std": 0.005750302225351334, + "rewards//mean": 0.7225341796875, + "rewards//std": 0.032271627336740494, + "step": 1689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.338, + "grad_norm": 9.191390991210938, + "kl": 1.743047945201397, + "learning_rate": 7.530168820605818e-07, + "loss": 0.1743, + "num_tokens": 14607320.0, + "reward": 0.74420166015625, + "reward_std": 0.00988863781094551, + "rewards//mean": 0.74420166015625, + "rewards//std": 0.03369634971022606, + "step": 1690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3382, + "grad_norm": 3.980365514755249, + "kl": 1.3973311893641949, + "learning_rate": 7.527431274237149e-07, + "loss": 0.1397, + "num_tokens": 14616048.0, + "reward": 0.729248046875, + "reward_std": 0.009896701201796532, + "rewards//mean": 0.729248046875, + "rewards//std": 0.02880498208105564, + "step": 1691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3384, + "grad_norm": 5.471235752105713, + "kl": 1.5794076099991798, + "learning_rate": 7.524692709819463e-07, + "loss": 0.1579, + "num_tokens": 14624664.0, + "reward": 0.7625732421875, + "reward_std": 0.017121773213148117, + "rewards//mean": 0.7625732421875, + "rewards//std": 0.02605532482266426, + "step": 1692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3386, + "grad_norm": 19.024185180664062, + "kl": 1.6618862971663475, + "learning_rate": 7.521953128455855e-07, + "loss": 0.1662, + "num_tokens": 14633264.0, + "reward": 0.754150390625, + "reward_std": 0.004000131972134113, + "rewards//mean": 0.754150390625, + "rewards//std": 0.027908140793442726, + "step": 1693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3388, + "grad_norm": 8.094286918640137, + "kl": 2.2584478612989187, + "learning_rate": 7.519212531249829e-07, + "loss": 0.2258, + "num_tokens": 14641864.0, + "reward": 0.74407958984375, + "reward_std": 0.008400380611419678, + "rewards//mean": 0.74407958984375, + "rewards//std": 0.025429869070649147, + "step": 1694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.339, + "grad_norm": 28.27859878540039, + "kl": 2.725379491224885, + "learning_rate": 7.516470919305298e-07, + "loss": 0.2725, + "num_tokens": 14650480.0, + "reward": 0.72296142578125, + "reward_std": 0.012959511019289494, + "rewards//mean": 0.72296142578125, + "rewards//std": 0.02918447181582451, + "step": 1695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3392, + "grad_norm": 20.440521240234375, + "kl": 1.9849905855953693, + "learning_rate": 7.513728293726579e-07, + "loss": 0.1985, + "num_tokens": 14659040.0, + "reward": 0.781005859375, + "reward_std": 0.012903118506073952, + "rewards//mean": 0.781005859375, + "rewards//std": 0.024238893762230873, + "step": 1696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3394, + "grad_norm": 25.964466094970703, + "kl": 1.5389139335602522, + "learning_rate": 7.510984655618406e-07, + "loss": 0.1539, + "num_tokens": 14667672.0, + "reward": 0.7200927734375, + "reward_std": 0.006359969265758991, + "rewards//mean": 0.7200927734375, + "rewards//std": 0.029258355498313904, + "step": 1697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3396, + "grad_norm": 9.197903633117676, + "kl": 1.9400007743388414, + "learning_rate": 7.508240006085913e-07, + "loss": 0.194, + "num_tokens": 14676360.0, + "reward": 0.75677490234375, + "reward_std": 0.007967963814735413, + "rewards//mean": 0.75677490234375, + "rewards//std": 0.025782210752367973, + "step": 1698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3398, + "grad_norm": 15.065352439880371, + "kl": 2.113749247044325, + "learning_rate": 7.505494346234647e-07, + "loss": 0.2114, + "num_tokens": 14684928.0, + "reward": 0.7586669921875, + "reward_std": 0.014345825649797916, + "rewards//mean": 0.7586669921875, + "rewards//std": 0.032980579882860184, + "step": 1699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.34, + "grad_norm": 19.388952255249023, + "kl": 3.091250417754054, + "learning_rate": 7.502747677170555e-07, + "loss": 0.3091, + "num_tokens": 14693568.0, + "reward": 0.738525390625, + "reward_std": 0.013732793740928173, + "rewards//mean": 0.738525390625, + "rewards//std": 0.03740192577242851, + "step": 1700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3402, + "grad_norm": 32.21320724487305, + "kl": 3.8705898970365524, + "learning_rate": 7.5e-07, + "loss": 0.3871, + "num_tokens": 14702208.0, + "reward": 0.73223876953125, + "reward_std": 0.019241712987422943, + "rewards//mean": 0.73223876953125, + "rewards//std": 0.06249197944998741, + "step": 1701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3404, + "grad_norm": 7.169406414031982, + "kl": 1.371338753029704, + "learning_rate": 7.497251315829743e-07, + "loss": 0.1371, + "num_tokens": 14710888.0, + "reward": 0.76019287109375, + "reward_std": 0.011022357270121574, + "rewards//mean": 0.76019287109375, + "rewards//std": 0.0305731613188982, + "step": 1702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3406, + "grad_norm": 7.21544075012207, + "kl": 0.9435538854449987, + "learning_rate": 7.494501625766955e-07, + "loss": 0.0944, + "num_tokens": 14719608.0, + "reward": 0.7705078125, + "reward_std": 0.0077237319201231, + "rewards//mean": 0.7705078125, + "rewards//std": 0.030233660712838173, + "step": 1703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3408, + "grad_norm": 1.1966946125030518, + "kl": 0.7175771556794643, + "learning_rate": 7.491750930919212e-07, + "loss": 0.0718, + "num_tokens": 14728232.0, + "reward": 0.7764892578125, + "reward_std": 0.0036823658738285303, + "rewards//mean": 0.7764892578125, + "rewards//std": 0.025358112528920174, + "step": 1704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.341, + "grad_norm": 2.9344372749328613, + "kl": 0.6410322058945894, + "learning_rate": 7.488999232394491e-07, + "loss": 0.0641, + "num_tokens": 14736920.0, + "reward": 0.7530517578125, + "reward_std": 0.003432015422731638, + "rewards//mean": 0.7530517578125, + "rewards//std": 0.025424884632229805, + "step": 1705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3412, + "grad_norm": 4.979156970977783, + "kl": 1.1780236940830946, + "learning_rate": 7.486246531301177e-07, + "loss": 0.1178, + "num_tokens": 14745584.0, + "reward": 0.7303466796875, + "reward_std": 0.011336080729961395, + "rewards//mean": 0.7303466796875, + "rewards//std": 0.0459880605340004, + "step": 1706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3414, + "grad_norm": 4.931070804595947, + "kl": 1.3935081399977207, + "learning_rate": 7.483492828748056e-07, + "loss": 0.1394, + "num_tokens": 14754312.0, + "reward": 0.7525634765625, + "reward_std": 0.013016480952501297, + "rewards//mean": 0.7525634765625, + "rewards//std": 0.026532016694545746, + "step": 1707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3416, + "grad_norm": 1.5224446058273315, + "kl": 0.5527310520410538, + "learning_rate": 7.480738125844322e-07, + "loss": 0.0553, + "num_tokens": 14762928.0, + "reward": 0.7354736328125, + "reward_std": 0.001750173862092197, + "rewards//mean": 0.7354736328125, + "rewards//std": 0.027370035648345947, + "step": 1708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3418, + "grad_norm": 1.7535755634307861, + "kl": 0.5227664280682802, + "learning_rate": 7.477982423699567e-07, + "loss": 0.0523, + "num_tokens": 14771584.0, + "reward": 0.77703857421875, + "reward_std": 0.004122797399759293, + "rewards//mean": 0.77703857421875, + "rewards//std": 0.024835387244820595, + "step": 1709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.342, + "grad_norm": 1.2541731595993042, + "kl": 0.576104111969471, + "learning_rate": 7.475225723423788e-07, + "loss": 0.0576, + "num_tokens": 14780264.0, + "reward": 0.76513671875, + "reward_std": 0.003349718637764454, + "rewards//mean": 0.76513671875, + "rewards//std": 0.021760214120149612, + "step": 1710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3422, + "grad_norm": 4.6338725090026855, + "kl": 1.3434299621731043, + "learning_rate": 7.472468026127384e-07, + "loss": 0.1343, + "num_tokens": 14789040.0, + "reward": 0.774658203125, + "reward_std": 0.010380629450082779, + "rewards//mean": 0.774658203125, + "rewards//std": 0.028055289760231972, + "step": 1711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3424, + "grad_norm": 4.566206455230713, + "kl": 0.7673952169716358, + "learning_rate": 7.469709332921154e-07, + "loss": 0.0767, + "num_tokens": 14797600.0, + "reward": 0.74444580078125, + "reward_std": 0.007673834916204214, + "rewards//mean": 0.74444580078125, + "rewards//std": 0.030063383281230927, + "step": 1712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3426, + "grad_norm": 1.81694757938385, + "kl": 1.1699585411697626, + "learning_rate": 7.4669496449163e-07, + "loss": 0.117, + "num_tokens": 14806248.0, + "reward": 0.74298095703125, + "reward_std": 0.005607170984148979, + "rewards//mean": 0.74298095703125, + "rewards//std": 0.01837710663676262, + "step": 1713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3428, + "grad_norm": 6.147146224975586, + "kl": 1.0212355516850948, + "learning_rate": 7.464188963224427e-07, + "loss": 0.1021, + "num_tokens": 14814856.0, + "reward": 0.7528076171875, + "reward_std": 0.010117903351783752, + "rewards//mean": 0.7528076171875, + "rewards//std": 0.030028430745005608, + "step": 1714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.343, + "grad_norm": 2.5232460498809814, + "kl": 0.9998667482286692, + "learning_rate": 7.461427288957531e-07, + "loss": 0.1, + "num_tokens": 14823456.0, + "reward": 0.75396728515625, + "reward_std": 0.00727267749607563, + "rewards//mean": 0.75396728515625, + "rewards//std": 0.025718722492456436, + "step": 1715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3432, + "grad_norm": 1.8526498079299927, + "kl": 0.9887200873345137, + "learning_rate": 7.45866462322802e-07, + "loss": 0.0989, + "num_tokens": 14832032.0, + "reward": 0.77264404296875, + "reward_std": 0.003451541531831026, + "rewards//mean": 0.77264404296875, + "rewards//std": 0.016070950776338577, + "step": 1716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3434, + "grad_norm": 1.2874479293823242, + "kl": 1.0913045555353165, + "learning_rate": 7.45590096714869e-07, + "loss": 0.1091, + "num_tokens": 14840616.0, + "reward": 0.7208251953125, + "reward_std": 0.008463181555271149, + "rewards//mean": 0.7208251953125, + "rewards//std": 0.030048588290810585, + "step": 1717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3436, + "grad_norm": 1.046310305595398, + "kl": 0.6895448602735996, + "learning_rate": 7.453136321832745e-07, + "loss": 0.069, + "num_tokens": 14849224.0, + "reward": 0.7720947265625, + "reward_std": 0.0013896794989705086, + "rewards//mean": 0.7720947265625, + "rewards//std": 0.015659356489777565, + "step": 1718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3438, + "grad_norm": 0.39746522903442383, + "kl": 0.4428255669772625, + "learning_rate": 7.450370688393784e-07, + "loss": 0.0443, + "num_tokens": 14857824.0, + "reward": 0.75555419921875, + "reward_std": 0.001083534792996943, + "rewards//mean": 0.75555419921875, + "rewards//std": 0.02159602753818035, + "step": 1719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.344, + "grad_norm": 1.386067509651184, + "kl": 1.1081818174570799, + "learning_rate": 7.447604067945802e-07, + "loss": 0.1108, + "num_tokens": 14866520.0, + "reward": 0.72930908203125, + "reward_std": 0.004850580357015133, + "rewards//mean": 0.72930908203125, + "rewards//std": 0.026790568605065346, + "step": 1720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3442, + "grad_norm": 4.141593933105469, + "kl": 1.4461893811821938, + "learning_rate": 7.444836461603194e-07, + "loss": 0.1446, + "num_tokens": 14875352.0, + "reward": 0.75823974609375, + "reward_std": 0.009514996781945229, + "rewards//mean": 0.75823974609375, + "rewards//std": 0.02587481215596199, + "step": 1721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3444, + "grad_norm": 5.451181888580322, + "kl": 1.7047587744891644, + "learning_rate": 7.442067870480751e-07, + "loss": 0.1705, + "num_tokens": 14883920.0, + "reward": 0.73529052734375, + "reward_std": 0.010152137838304043, + "rewards//mean": 0.73529052734375, + "rewards//std": 0.038655757904052734, + "step": 1722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3446, + "grad_norm": 4.6878886222839355, + "kl": 1.3176060393452644, + "learning_rate": 7.439298295693663e-07, + "loss": 0.1318, + "num_tokens": 14892592.0, + "reward": 0.743896484375, + "reward_std": 0.005385981872677803, + "rewards//mean": 0.743896484375, + "rewards//std": 0.036157313734292984, + "step": 1723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3448, + "grad_norm": 2.1867740154266357, + "kl": 0.7468766365200281, + "learning_rate": 7.436527738357513e-07, + "loss": 0.0747, + "num_tokens": 14901192.0, + "reward": 0.76495361328125, + "reward_std": 0.00658496655523777, + "rewards//mean": 0.76495361328125, + "rewards//std": 0.03055681847035885, + "step": 1724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.345, + "grad_norm": 3.0420939922332764, + "kl": 0.6787095963954926, + "learning_rate": 7.433756199588282e-07, + "loss": 0.0679, + "num_tokens": 14909840.0, + "reward": 0.778076171875, + "reward_std": 0.0028048825915902853, + "rewards//mean": 0.778076171875, + "rewards//std": 0.021675176918506622, + "step": 1725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3452, + "grad_norm": 3.7577598094940186, + "kl": 1.7973673641681671, + "learning_rate": 7.430983680502343e-07, + "loss": 0.1797, + "num_tokens": 14918520.0, + "reward": 0.761962890625, + "reward_std": 0.014277311973273754, + "rewards//mean": 0.761962890625, + "rewards//std": 0.03215061128139496, + "step": 1726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3454, + "grad_norm": 6.4812750816345215, + "kl": 1.8695234637707472, + "learning_rate": 7.42821018221647e-07, + "loss": 0.187, + "num_tokens": 14927264.0, + "reward": 0.77349853515625, + "reward_std": 0.011050630360841751, + "rewards//mean": 0.77349853515625, + "rewards//std": 0.04331118240952492, + "step": 1727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3456, + "grad_norm": 1.821395993232727, + "kl": 1.505713665857911, + "learning_rate": 7.425435705847825e-07, + "loss": 0.1506, + "num_tokens": 14935976.0, + "reward": 0.76043701171875, + "reward_std": 0.009524603374302387, + "rewards//mean": 0.76043701171875, + "rewards//std": 0.02476458251476288, + "step": 1728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3458, + "grad_norm": 5.484038829803467, + "kl": 1.577188765630126, + "learning_rate": 7.422660252513968e-07, + "loss": 0.1577, + "num_tokens": 14944624.0, + "reward": 0.74481201171875, + "reward_std": 0.007182367146015167, + "rewards//mean": 0.74481201171875, + "rewards//std": 0.02699209563434124, + "step": 1729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.346, + "grad_norm": 2.585268020629883, + "kl": 1.7433014456182718, + "learning_rate": 7.41988382333285e-07, + "loss": 0.1743, + "num_tokens": 14953312.0, + "reward": 0.74993896484375, + "reward_std": 0.011576471850275993, + "rewards//mean": 0.74993896484375, + "rewards//std": 0.029891187325119972, + "step": 1730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3462, + "grad_norm": 3.90644907951355, + "kl": 1.3983549159020185, + "learning_rate": 7.417106419422818e-07, + "loss": 0.1398, + "num_tokens": 14961880.0, + "reward": 0.79949951171875, + "reward_std": 0.010875910520553589, + "rewards//mean": 0.79949951171875, + "rewards//std": 0.023959290236234665, + "step": 1731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3464, + "grad_norm": 2.3305535316467285, + "kl": 1.827426040545106, + "learning_rate": 7.41432804190261e-07, + "loss": 0.1827, + "num_tokens": 14970464.0, + "reward": 0.7171630859375, + "reward_std": 0.008515611290931702, + "rewards//mean": 0.7171630859375, + "rewards//std": 0.032156966626644135, + "step": 1732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3466, + "grad_norm": 2.596547842025757, + "kl": 1.9944287464022636, + "learning_rate": 7.411548691891357e-07, + "loss": 0.1994, + "num_tokens": 14979112.0, + "reward": 0.76739501953125, + "reward_std": 0.011900435201823711, + "rewards//mean": 0.76739501953125, + "rewards//std": 0.036057062447071075, + "step": 1733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3468, + "grad_norm": 2.95515775680542, + "kl": 2.192312242463231, + "learning_rate": 7.408768370508576e-07, + "loss": 0.2192, + "num_tokens": 14987824.0, + "reward": 0.75433349609375, + "reward_std": 0.011328080669045448, + "rewards//mean": 0.75433349609375, + "rewards//std": 0.03066413104534149, + "step": 1734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.347, + "grad_norm": 0.8474298119544983, + "kl": 0.791348984465003, + "learning_rate": 7.405987078874185e-07, + "loss": 0.0791, + "num_tokens": 14996432.0, + "reward": 0.75408935546875, + "reward_std": 0.003990694880485535, + "rewards//mean": 0.75408935546875, + "rewards//std": 0.03140794113278389, + "step": 1735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3472, + "grad_norm": 3.539428472518921, + "kl": 1.5238278284668922, + "learning_rate": 7.403204818108487e-07, + "loss": 0.1524, + "num_tokens": 15005096.0, + "reward": 0.7723388671875, + "reward_std": 0.012794092297554016, + "rewards//mean": 0.7723388671875, + "rewards//std": 0.03929413482546806, + "step": 1736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3474, + "grad_norm": 1.4129527807235718, + "kl": 1.3678991589695215, + "learning_rate": 7.400421589332174e-07, + "loss": 0.1368, + "num_tokens": 15013688.0, + "reward": 0.76708984375, + "reward_std": 0.009629899635910988, + "rewards//mean": 0.76708984375, + "rewards//std": 0.031369902193546295, + "step": 1737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3476, + "grad_norm": 1.8591399192810059, + "kl": 1.5847214739769697, + "learning_rate": 7.397637393666333e-07, + "loss": 0.1585, + "num_tokens": 15022464.0, + "reward": 0.74346923828125, + "reward_std": 0.007429173681885004, + "rewards//mean": 0.74346923828125, + "rewards//std": 0.025986313819885254, + "step": 1738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3478, + "grad_norm": 2.165801763534546, + "kl": 1.4061892107129097, + "learning_rate": 7.394852232232436e-07, + "loss": 0.1406, + "num_tokens": 15031144.0, + "reward": 0.78155517578125, + "reward_std": 0.011306533589959145, + "rewards//mean": 0.78155517578125, + "rewards//std": 0.02262035384774208, + "step": 1739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.348, + "grad_norm": 1.9580681324005127, + "kl": 1.902137791737914, + "learning_rate": 7.392066106152345e-07, + "loss": 0.1902, + "num_tokens": 15039800.0, + "reward": 0.76641845703125, + "reward_std": 0.009985348209738731, + "rewards//mean": 0.76641845703125, + "rewards//std": 0.028384050354361534, + "step": 1740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3482, + "grad_norm": 0.9027834534645081, + "kl": 0.7155110444873571, + "learning_rate": 7.389279016548316e-07, + "loss": 0.0716, + "num_tokens": 15048400.0, + "reward": 0.75201416015625, + "reward_std": 0.003574079368263483, + "rewards//mean": 0.75201416015625, + "rewards//std": 0.029528411105275154, + "step": 1741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3484, + "grad_norm": 2.6010026931762695, + "kl": 2.065766640007496, + "learning_rate": 7.386490964542982e-07, + "loss": 0.2066, + "num_tokens": 15057024.0, + "reward": 0.7806396484375, + "reward_std": 0.012823672965168953, + "rewards//mean": 0.7806396484375, + "rewards//std": 0.03279462456703186, + "step": 1742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3486, + "grad_norm": 4.488386154174805, + "kl": 1.851673873141408, + "learning_rate": 7.383701951259375e-07, + "loss": 0.1852, + "num_tokens": 15065576.0, + "reward": 0.786376953125, + "reward_std": 0.007897108793258667, + "rewards//mean": 0.786376953125, + "rewards//std": 0.026281218975782394, + "step": 1743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3488, + "grad_norm": 7.884819984436035, + "kl": 3.009798128157854, + "learning_rate": 7.380911977820906e-07, + "loss": 0.301, + "num_tokens": 15074264.0, + "reward": 0.73211669921875, + "reward_std": 0.012890150770545006, + "rewards//mean": 0.73211669921875, + "rewards//std": 0.027399539947509766, + "step": 1744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.349, + "grad_norm": 1.4514238834381104, + "kl": 1.010293835774064, + "learning_rate": 7.378121045351377e-07, + "loss": 0.101, + "num_tokens": 15082896.0, + "reward": 0.74908447265625, + "reward_std": 0.002725997706875205, + "rewards//mean": 0.74908447265625, + "rewards//std": 0.019284680485725403, + "step": 1745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3492, + "grad_norm": 5.77423620223999, + "kl": 2.6011885572224855, + "learning_rate": 7.375329154974975e-07, + "loss": 0.2601, + "num_tokens": 15091528.0, + "reward": 0.7545166015625, + "reward_std": 0.014566123485565186, + "rewards//mean": 0.7545166015625, + "rewards//std": 0.02837584912776947, + "step": 1746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3494, + "grad_norm": 3.199122667312622, + "kl": 2.118050239980221, + "learning_rate": 7.372536307816272e-07, + "loss": 0.2118, + "num_tokens": 15100120.0, + "reward": 0.7659912109375, + "reward_std": 0.009744489565491676, + "rewards//mean": 0.7659912109375, + "rewards//std": 0.02130145952105522, + "step": 1747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3496, + "grad_norm": 2.7947630882263184, + "kl": 1.1547651756554842, + "learning_rate": 7.369742505000231e-07, + "loss": 0.1155, + "num_tokens": 15108744.0, + "reward": 0.76104736328125, + "reward_std": 0.007997667416930199, + "rewards//mean": 0.76104736328125, + "rewards//std": 0.03725060448050499, + "step": 1748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3498, + "grad_norm": 1.3045270442962646, + "kl": 1.191990939900279, + "learning_rate": 7.366947747652191e-07, + "loss": 0.1192, + "num_tokens": 15117312.0, + "reward": 0.73974609375, + "reward_std": 0.008499959483742714, + "rewards//mean": 0.73974609375, + "rewards//std": 0.027040911838412285, + "step": 1749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.35, + "grad_norm": 7.094205379486084, + "kl": 0.9772929809987545, + "learning_rate": 7.364152036897882e-07, + "loss": 0.0977, + "num_tokens": 15125920.0, + "reward": 0.747314453125, + "reward_std": 0.006368104834109545, + "rewards//mean": 0.747314453125, + "rewards//std": 0.03124128095805645, + "step": 1750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3502, + "grad_norm": 1.8389047384262085, + "kl": 1.2565936334431171, + "learning_rate": 7.361355373863413e-07, + "loss": 0.1257, + "num_tokens": 15134488.0, + "reward": 0.740234375, + "reward_std": 0.006755336653441191, + "rewards//mean": 0.740234375, + "rewards//std": 0.03456920012831688, + "step": 1751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3504, + "grad_norm": 2.2567107677459717, + "kl": 1.6556364316493273, + "learning_rate": 7.358557759675284e-07, + "loss": 0.1656, + "num_tokens": 15143216.0, + "reward": 0.77337646484375, + "reward_std": 0.01561724953353405, + "rewards//mean": 0.77337646484375, + "rewards//std": 0.03251480311155319, + "step": 1752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3506, + "grad_norm": 3.0326714515686035, + "kl": 2.143301162868738, + "learning_rate": 7.35575919546037e-07, + "loss": 0.2143, + "num_tokens": 15152040.0, + "reward": 0.73162841796875, + "reward_std": 0.011417457833886147, + "rewards//mean": 0.73162841796875, + "rewards//std": 0.041722897440195084, + "step": 1753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3508, + "grad_norm": 6.754617214202881, + "kl": 0.7951330132782459, + "learning_rate": 7.352959682345935e-07, + "loss": 0.0795, + "num_tokens": 15160672.0, + "reward": 0.74951171875, + "reward_std": 0.004714501090347767, + "rewards//mean": 0.74951171875, + "rewards//std": 0.023556042462587357, + "step": 1754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.351, + "grad_norm": 1.2019836902618408, + "kl": 0.9368169568479061, + "learning_rate": 7.350159221459621e-07, + "loss": 0.0937, + "num_tokens": 15169312.0, + "reward": 0.78302001953125, + "reward_std": 0.005903866142034531, + "rewards//mean": 0.78302001953125, + "rewards//std": 0.02406897395849228, + "step": 1755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3512, + "grad_norm": 1.4883495569229126, + "kl": 1.3949045836925507, + "learning_rate": 7.347357813929454e-07, + "loss": 0.1395, + "num_tokens": 15177848.0, + "reward": 0.75537109375, + "reward_std": 0.004409474320709705, + "rewards//mean": 0.75537109375, + "rewards//std": 0.021298972889780998, + "step": 1756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3514, + "grad_norm": 7.7861247062683105, + "kl": 1.6032751519232988, + "learning_rate": 7.344555460883839e-07, + "loss": 0.1603, + "num_tokens": 15186416.0, + "reward": 0.75677490234375, + "reward_std": 0.00679248571395874, + "rewards//mean": 0.75677490234375, + "rewards//std": 0.023075519129633904, + "step": 1757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3516, + "grad_norm": 14.671707153320312, + "kl": 1.3416200447827578, + "learning_rate": 7.341752163451567e-07, + "loss": 0.1342, + "num_tokens": 15195064.0, + "reward": 0.72930908203125, + "reward_std": 0.004718040581792593, + "rewards//mean": 0.72930908203125, + "rewards//std": 0.028979387134313583, + "step": 1758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3518, + "grad_norm": 3.320373058319092, + "kl": 1.6173847205936909, + "learning_rate": 7.338947922761802e-07, + "loss": 0.1617, + "num_tokens": 15203696.0, + "reward": 0.75665283203125, + "reward_std": 0.011801866814494133, + "rewards//mean": 0.75665283203125, + "rewards//std": 0.026527095586061478, + "step": 1759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.352, + "grad_norm": 2.639523983001709, + "kl": 1.4315192308276892, + "learning_rate": 7.336142739944093e-07, + "loss": 0.1432, + "num_tokens": 15212416.0, + "reward": 0.76043701171875, + "reward_std": 0.011404254473745823, + "rewards//mean": 0.76043701171875, + "rewards//std": 0.031180594116449356, + "step": 1760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3522, + "grad_norm": 0.6230894327163696, + "kl": 0.6868654675781727, + "learning_rate": 7.333336616128369e-07, + "loss": 0.0687, + "num_tokens": 15221032.0, + "reward": 0.75579833984375, + "reward_std": 0.0021794678177684546, + "rewards//mean": 0.75579833984375, + "rewards//std": 0.02194301225244999, + "step": 1761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3524, + "grad_norm": 3.316927194595337, + "kl": 1.6013391427695751, + "learning_rate": 7.330529552444932e-07, + "loss": 0.1601, + "num_tokens": 15229704.0, + "reward": 0.73968505859375, + "reward_std": 0.006840704008936882, + "rewards//mean": 0.73968505859375, + "rewards//std": 0.030124248936772346, + "step": 1762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3526, + "grad_norm": 5.732491970062256, + "kl": 1.6784944124519825, + "learning_rate": 7.327721550024475e-07, + "loss": 0.1678, + "num_tokens": 15238400.0, + "reward": 0.72845458984375, + "reward_std": 0.010432370938360691, + "rewards//mean": 0.72845458984375, + "rewards//std": 0.02905711531639099, + "step": 1763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3528, + "grad_norm": 3.6753923892974854, + "kl": 0.8355032317340374, + "learning_rate": 7.324912609998053e-07, + "loss": 0.0836, + "num_tokens": 15247072.0, + "reward": 0.73590087890625, + "reward_std": 0.005861599929630756, + "rewards//mean": 0.73590087890625, + "rewards//std": 0.02998119592666626, + "step": 1764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.353, + "grad_norm": 2.246016263961792, + "kl": 2.139521811157465, + "learning_rate": 7.322102733497109e-07, + "loss": 0.214, + "num_tokens": 15255752.0, + "reward": 0.76531982421875, + "reward_std": 0.013181580230593681, + "rewards//mean": 0.76531982421875, + "rewards//std": 0.035613518208265305, + "step": 1765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3532, + "grad_norm": 3.0697615146636963, + "kl": 1.3367989528924227, + "learning_rate": 7.319291921653463e-07, + "loss": 0.1337, + "num_tokens": 15264384.0, + "reward": 0.74383544921875, + "reward_std": 0.003256019204854965, + "rewards//mean": 0.74383544921875, + "rewards//std": 0.026582960039377213, + "step": 1766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3534, + "grad_norm": 1.91382896900177, + "kl": 1.5459059569984674, + "learning_rate": 7.316480175599308e-07, + "loss": 0.1546, + "num_tokens": 15273048.0, + "reward": 0.77105712890625, + "reward_std": 0.009686676785349846, + "rewards//mean": 0.77105712890625, + "rewards//std": 0.028276648372411728, + "step": 1767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3536, + "grad_norm": 1.2383081912994385, + "kl": 0.6192030981183052, + "learning_rate": 7.313667496467215e-07, + "loss": 0.0619, + "num_tokens": 15281632.0, + "reward": 0.75714111328125, + "reward_std": 0.004506496712565422, + "rewards//mean": 0.75714111328125, + "rewards//std": 0.021413005888462067, + "step": 1768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3538, + "grad_norm": 1.7629684209823608, + "kl": 1.344154804944992, + "learning_rate": 7.310853885390132e-07, + "loss": 0.1344, + "num_tokens": 15290240.0, + "reward": 0.77435302734375, + "reward_std": 0.005163392052054405, + "rewards//mean": 0.77435302734375, + "rewards//std": 0.021527927368879318, + "step": 1769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.354, + "grad_norm": 6.2378973960876465, + "kl": 2.1335525270551443, + "learning_rate": 7.308039343501379e-07, + "loss": 0.2134, + "num_tokens": 15298944.0, + "reward": 0.73681640625, + "reward_std": 0.011033887043595314, + "rewards//mean": 0.73681640625, + "rewards//std": 0.03596673905849457, + "step": 1770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3542, + "grad_norm": 2.182572841644287, + "kl": 1.2096397168934345, + "learning_rate": 7.305223871934656e-07, + "loss": 0.121, + "num_tokens": 15307616.0, + "reward": 0.77587890625, + "reward_std": 0.007967408746480942, + "rewards//mean": 0.77587890625, + "rewards//std": 0.022846169769763947, + "step": 1771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3544, + "grad_norm": 4.277583122253418, + "kl": 1.1626602467149496, + "learning_rate": 7.302407471824033e-07, + "loss": 0.1163, + "num_tokens": 15316304.0, + "reward": 0.76617431640625, + "reward_std": 0.0031837287824600935, + "rewards//mean": 0.76617431640625, + "rewards//std": 0.0290310550481081, + "step": 1772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3546, + "grad_norm": 2.832304000854492, + "kl": 1.7627367228269577, + "learning_rate": 7.299590144303954e-07, + "loss": 0.1763, + "num_tokens": 15324912.0, + "reward": 0.7637939453125, + "reward_std": 0.014823010191321373, + "rewards//mean": 0.7637939453125, + "rewards//std": 0.032234080135822296, + "step": 1773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3548, + "grad_norm": 8.819801330566406, + "kl": 1.3549708854407072, + "learning_rate": 7.296771890509242e-07, + "loss": 0.1355, + "num_tokens": 15333488.0, + "reward": 0.772216796875, + "reward_std": 0.010270223021507263, + "rewards//mean": 0.772216796875, + "rewards//std": 0.03110920637845993, + "step": 1774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.355, + "grad_norm": 2.678743362426758, + "kl": 1.418468652293086, + "learning_rate": 7.293952711575086e-07, + "loss": 0.1418, + "num_tokens": 15342088.0, + "reward": 0.74639892578125, + "reward_std": 0.008328201249241829, + "rewards//mean": 0.74639892578125, + "rewards//std": 0.038351066410541534, + "step": 1775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3552, + "grad_norm": 1.8848110437393188, + "kl": 1.794282415881753, + "learning_rate": 7.291132608637052e-07, + "loss": 0.1794, + "num_tokens": 15350712.0, + "reward": 0.7708740234375, + "reward_std": 0.01875242590904236, + "rewards//mean": 0.7708740234375, + "rewards//std": 0.03361161798238754, + "step": 1776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3554, + "grad_norm": 3.2355854511260986, + "kl": 1.2206679452210665, + "learning_rate": 7.288311582831077e-07, + "loss": 0.1221, + "num_tokens": 15359416.0, + "reward": 0.75213623046875, + "reward_std": 0.007684601470828056, + "rewards//mean": 0.75213623046875, + "rewards//std": 0.02200845256447792, + "step": 1777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3556, + "grad_norm": 2.0275652408599854, + "kl": 1.738957367837429, + "learning_rate": 7.285489635293471e-07, + "loss": 0.1739, + "num_tokens": 15368040.0, + "reward": 0.7371826171875, + "reward_std": 0.0094405896961689, + "rewards//mean": 0.7371826171875, + "rewards//std": 0.022514790296554565, + "step": 1778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3558, + "grad_norm": 2.643275737762451, + "kl": 2.499020282179117, + "learning_rate": 7.282666767160912e-07, + "loss": 0.2499, + "num_tokens": 15376712.0, + "reward": 0.73748779296875, + "reward_std": 0.01339271105825901, + "rewards//mean": 0.73748779296875, + "rewards//std": 0.03584315627813339, + "step": 1779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.356, + "grad_norm": 2.4836769104003906, + "kl": 1.1561640910804272, + "learning_rate": 7.279842979570453e-07, + "loss": 0.1156, + "num_tokens": 15385416.0, + "reward": 0.78717041015625, + "reward_std": 0.007983257994055748, + "rewards//mean": 0.78717041015625, + "rewards//std": 0.0198082085698843, + "step": 1780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3562, + "grad_norm": 1.1851799488067627, + "kl": 1.1809126678854227, + "learning_rate": 7.277018273659516e-07, + "loss": 0.1181, + "num_tokens": 15394024.0, + "reward": 0.78594970703125, + "reward_std": 0.006664501037448645, + "rewards//mean": 0.78594970703125, + "rewards//std": 0.03354641795158386, + "step": 1781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3564, + "grad_norm": 3.261087656021118, + "kl": 0.7810133509337902, + "learning_rate": 7.274192650565889e-07, + "loss": 0.0781, + "num_tokens": 15402640.0, + "reward": 0.7667236328125, + "reward_std": 0.0022175246849656105, + "rewards//mean": 0.7667236328125, + "rewards//std": 0.0193314291536808, + "step": 1782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3566, + "grad_norm": 4.513838291168213, + "kl": 0.8302067779004574, + "learning_rate": 7.271366111427734e-07, + "loss": 0.083, + "num_tokens": 15411344.0, + "reward": 0.7559814453125, + "reward_std": 0.006560072302818298, + "rewards//mean": 0.7559814453125, + "rewards//std": 0.02708090841770172, + "step": 1783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3568, + "grad_norm": 3.3704817295074463, + "kl": 1.1860128305852413, + "learning_rate": 7.26853865738358e-07, + "loss": 0.1186, + "num_tokens": 15419992.0, + "reward": 0.7576904296875, + "reward_std": 0.0033324281685054302, + "rewards//mean": 0.7576904296875, + "rewards//std": 0.024488355964422226, + "step": 1784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.357, + "grad_norm": 1.4541735649108887, + "kl": 1.462040601298213, + "learning_rate": 7.265710289572328e-07, + "loss": 0.1462, + "num_tokens": 15428584.0, + "reward": 0.7762451171875, + "reward_std": 0.010556678287684917, + "rewards//mean": 0.7762451171875, + "rewards//std": 0.03362782672047615, + "step": 1785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3572, + "grad_norm": 3.4049315452575684, + "kl": 1.2772125378251076, + "learning_rate": 7.262881009133241e-07, + "loss": 0.1277, + "num_tokens": 15437216.0, + "reward": 0.75726318359375, + "reward_std": 0.008723619394004345, + "rewards//mean": 0.75726318359375, + "rewards//std": 0.027558742091059685, + "step": 1786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3574, + "grad_norm": 1.7542139291763306, + "kl": 1.4295821785926819, + "learning_rate": 7.260050817205955e-07, + "loss": 0.143, + "num_tokens": 15445824.0, + "reward": 0.75946044921875, + "reward_std": 0.009563788771629333, + "rewards//mean": 0.75946044921875, + "rewards//std": 0.03559650853276253, + "step": 1787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3576, + "grad_norm": 2.8776495456695557, + "kl": 1.384749609977007, + "learning_rate": 7.25721971493047e-07, + "loss": 0.1385, + "num_tokens": 15454392.0, + "reward": 0.75311279296875, + "reward_std": 0.007299318909645081, + "rewards//mean": 0.75311279296875, + "rewards//std": 0.0289082583039999, + "step": 1788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3578, + "grad_norm": 6.074490547180176, + "kl": 1.5374908838421106, + "learning_rate": 7.254387703447153e-07, + "loss": 0.1537, + "num_tokens": 15462984.0, + "reward": 0.754150390625, + "reward_std": 0.009968824684619904, + "rewards//mean": 0.754150390625, + "rewards//std": 0.03229343146085739, + "step": 1789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.358, + "grad_norm": 6.639472484588623, + "kl": 2.2980942018330097, + "learning_rate": 7.25155478389674e-07, + "loss": 0.2298, + "num_tokens": 15471616.0, + "reward": 0.751953125, + "reward_std": 0.010655362159013748, + "rewards//mean": 0.751953125, + "rewards//std": 0.03024967759847641, + "step": 1790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3582, + "grad_norm": 1.7258334159851074, + "kl": 0.9021014496684074, + "learning_rate": 7.248720957420329e-07, + "loss": 0.0902, + "num_tokens": 15480296.0, + "reward": 0.766845703125, + "reward_std": 0.004705042112618685, + "rewards//mean": 0.766845703125, + "rewards//std": 0.023008590564131737, + "step": 1791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3584, + "grad_norm": 2.6910016536712646, + "kl": 1.2031854316592216, + "learning_rate": 7.245886225159386e-07, + "loss": 0.1203, + "num_tokens": 15488912.0, + "reward": 0.70916748046875, + "reward_std": 0.005398493260145187, + "rewards//mean": 0.70916748046875, + "rewards//std": 0.04426497966051102, + "step": 1792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3586, + "grad_norm": 5.5270209312438965, + "kl": 1.1625807750970125, + "learning_rate": 7.243050588255737e-07, + "loss": 0.1163, + "num_tokens": 15497536.0, + "reward": 0.77923583984375, + "reward_std": 0.011963524855673313, + "rewards//mean": 0.77923583984375, + "rewards//std": 0.031071167439222336, + "step": 1793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3588, + "grad_norm": 9.999727249145508, + "kl": 2.41350987367332, + "learning_rate": 7.240214047851581e-07, + "loss": 0.2414, + "num_tokens": 15506176.0, + "reward": 0.73736572265625, + "reward_std": 0.008081833831965923, + "rewards//mean": 0.73736572265625, + "rewards//std": 0.03701702132821083, + "step": 1794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.359, + "grad_norm": 1.8980357646942139, + "kl": 1.4534629695117474, + "learning_rate": 7.237376605089476e-07, + "loss": 0.1453, + "num_tokens": 15514840.0, + "reward": 0.76812744140625, + "reward_std": 0.00910151470452547, + "rewards//mean": 0.76812744140625, + "rewards//std": 0.02597058191895485, + "step": 1795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3592, + "grad_norm": 2.3029587268829346, + "kl": 1.6458844356238842, + "learning_rate": 7.234538261112341e-07, + "loss": 0.1646, + "num_tokens": 15523432.0, + "reward": 0.75567626953125, + "reward_std": 0.01055966503918171, + "rewards//mean": 0.75567626953125, + "rewards//std": 0.02400852181017399, + "step": 1796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3594, + "grad_norm": 3.215376377105713, + "kl": 1.1200422495603561, + "learning_rate": 7.23169901706346e-07, + "loss": 0.112, + "num_tokens": 15532040.0, + "reward": 0.736572265625, + "reward_std": 0.007564832456409931, + "rewards//mean": 0.736572265625, + "rewards//std": 0.03307156264781952, + "step": 1797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3596, + "grad_norm": 2.5984058380126953, + "kl": 1.0757994446903467, + "learning_rate": 7.228858874086484e-07, + "loss": 0.1076, + "num_tokens": 15540624.0, + "reward": 0.76824951171875, + "reward_std": 0.004476871341466904, + "rewards//mean": 0.76824951171875, + "rewards//std": 0.01767677441239357, + "step": 1798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3598, + "grad_norm": 3.9091780185699463, + "kl": 1.3355560693889856, + "learning_rate": 7.226017833325419e-07, + "loss": 0.1336, + "num_tokens": 15549200.0, + "reward": 0.7333984375, + "reward_std": 0.00962809193879366, + "rewards//mean": 0.7333984375, + "rewards//std": 0.029413418844342232, + "step": 1799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.36, + "grad_norm": 8.66907024383545, + "kl": 0.9297815319150686, + "learning_rate": 7.223175895924637e-07, + "loss": 0.093, + "num_tokens": 15557776.0, + "reward": 0.78564453125, + "reward_std": 0.004353870637714863, + "rewards//mean": 0.78564453125, + "rewards//std": 0.022450489923357964, + "step": 1800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3602, + "grad_norm": 2.6104724407196045, + "kl": 2.3057056684046984, + "learning_rate": 7.220333063028871e-07, + "loss": 0.2306, + "num_tokens": 15566328.0, + "reward": 0.73406982421875, + "reward_std": 0.016842670738697052, + "rewards//mean": 0.73406982421875, + "rewards//std": 0.030948638916015625, + "step": 1801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3604, + "grad_norm": 5.116308689117432, + "kl": 1.9751359019428492, + "learning_rate": 7.217489335783211e-07, + "loss": 0.1975, + "num_tokens": 15575120.0, + "reward": 0.80010986328125, + "reward_std": 0.016279060393571854, + "rewards//mean": 0.80010986328125, + "rewards//std": 0.035129059106111526, + "step": 1802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3606, + "grad_norm": 1.5109270811080933, + "kl": 0.8686944153159857, + "learning_rate": 7.214644715333114e-07, + "loss": 0.0869, + "num_tokens": 15583712.0, + "reward": 0.7593994140625, + "reward_std": 0.0061800191178917885, + "rewards//mean": 0.7593994140625, + "rewards//std": 0.02092287689447403, + "step": 1803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3608, + "grad_norm": 5.090044975280762, + "kl": 1.8055401183664799, + "learning_rate": 7.211799202824388e-07, + "loss": 0.1806, + "num_tokens": 15592344.0, + "reward": 0.76202392578125, + "reward_std": 0.014151292853057384, + "rewards//mean": 0.76202392578125, + "rewards//std": 0.030499298125505447, + "step": 1804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.361, + "grad_norm": 2.3456919193267822, + "kl": 1.314583534374833, + "learning_rate": 7.20895279940321e-07, + "loss": 0.1315, + "num_tokens": 15600928.0, + "reward": 0.7452392578125, + "reward_std": 0.0053898547776043415, + "rewards//mean": 0.7452392578125, + "rewards//std": 0.02513747289776802, + "step": 1805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3612, + "grad_norm": 1.9758235216140747, + "kl": 0.7922122180461884, + "learning_rate": 7.206105506216106e-07, + "loss": 0.0792, + "num_tokens": 15609536.0, + "reward": 0.76885986328125, + "reward_std": 0.007231271825730801, + "rewards//mean": 0.76885986328125, + "rewards//std": 0.02681654877960682, + "step": 1806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3614, + "grad_norm": 4.113847255706787, + "kl": 1.960466867312789, + "learning_rate": 7.203257324409971e-07, + "loss": 0.196, + "num_tokens": 15618136.0, + "reward": 0.75701904296875, + "reward_std": 0.013052722439169884, + "rewards//mean": 0.75701904296875, + "rewards//std": 0.03603186458349228, + "step": 1807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3616, + "grad_norm": 2.9570446014404297, + "kl": 1.1656327359378338, + "learning_rate": 7.200408255132045e-07, + "loss": 0.1166, + "num_tokens": 15626680.0, + "reward": 0.74884033203125, + "reward_std": 0.004135049879550934, + "rewards//mean": 0.74884033203125, + "rewards//std": 0.01934659481048584, + "step": 1808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3618, + "grad_norm": 1.1830683946609497, + "kl": 0.6443019825965166, + "learning_rate": 7.19755829952994e-07, + "loss": 0.0644, + "num_tokens": 15635376.0, + "reward": 0.791748046875, + "reward_std": 0.0024971095845103264, + "rewards//mean": 0.791748046875, + "rewards//std": 0.016702014952898026, + "step": 1809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.362, + "grad_norm": 3.4049994945526123, + "kl": 1.9126994479447603, + "learning_rate": 7.194707458751615e-07, + "loss": 0.1913, + "num_tokens": 15644080.0, + "reward": 0.75152587890625, + "reward_std": 0.010969490744173527, + "rewards//mean": 0.75152587890625, + "rewards//std": 0.029697103425860405, + "step": 1810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3622, + "grad_norm": 9.74229907989502, + "kl": 3.112483039498329, + "learning_rate": 7.191855733945386e-07, + "loss": 0.3112, + "num_tokens": 15652968.0, + "reward": 0.73614501953125, + "reward_std": 0.013483827002346516, + "rewards//mean": 0.73614501953125, + "rewards//std": 0.04093911126255989, + "step": 1811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3624, + "grad_norm": 1.8979837894439697, + "kl": 1.671231472864747, + "learning_rate": 7.189003126259931e-07, + "loss": 0.1671, + "num_tokens": 15661712.0, + "reward": 0.7601318359375, + "reward_std": 0.01090779434889555, + "rewards//mean": 0.7601318359375, + "rewards//std": 0.026011131703853607, + "step": 1812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3626, + "grad_norm": 1.5321110486984253, + "kl": 0.9612350445240736, + "learning_rate": 7.186149636844279e-07, + "loss": 0.0961, + "num_tokens": 15670432.0, + "reward": 0.77783203125, + "reward_std": 0.0065762861631810665, + "rewards//mean": 0.77783203125, + "rewards//std": 0.02333912067115307, + "step": 1813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3628, + "grad_norm": 4.263502597808838, + "kl": 1.7631815448403358, + "learning_rate": 7.183295266847814e-07, + "loss": 0.1763, + "num_tokens": 15679056.0, + "reward": 0.75384521484375, + "reward_std": 0.013981279917061329, + "rewards//mean": 0.75384521484375, + "rewards//std": 0.023276690393686295, + "step": 1814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.363, + "grad_norm": 2.405259370803833, + "kl": 1.0479095336049795, + "learning_rate": 7.180440017420276e-07, + "loss": 0.1048, + "num_tokens": 15687624.0, + "reward": 0.7518310546875, + "reward_std": 0.005571077577769756, + "rewards//mean": 0.7518310546875, + "rewards//std": 0.029330700635910034, + "step": 1815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3632, + "grad_norm": 6.763623237609863, + "kl": 1.44739506021142, + "learning_rate": 7.177583889711762e-07, + "loss": 0.1447, + "num_tokens": 15696272.0, + "reward": 0.77642822265625, + "reward_std": 0.003670833073556423, + "rewards//mean": 0.77642822265625, + "rewards//std": 0.03301194682717323, + "step": 1816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3634, + "grad_norm": 0.9895033240318298, + "kl": 1.0269442293792963, + "learning_rate": 7.174726884872715e-07, + "loss": 0.1027, + "num_tokens": 15704840.0, + "reward": 0.75531005859375, + "reward_std": 0.004540668334811926, + "rewards//mean": 0.75531005859375, + "rewards//std": 0.029243025928735733, + "step": 1817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3636, + "grad_norm": 3.1822571754455566, + "kl": 2.347819235175848, + "learning_rate": 7.17186900405394e-07, + "loss": 0.2348, + "num_tokens": 15713432.0, + "reward": 0.75091552734375, + "reward_std": 0.021689504384994507, + "rewards//mean": 0.75091552734375, + "rewards//std": 0.035199228674173355, + "step": 1818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3638, + "grad_norm": 1.3811274766921997, + "kl": 0.5692074857652187, + "learning_rate": 7.169010248406588e-07, + "loss": 0.0569, + "num_tokens": 15722048.0, + "reward": 0.75347900390625, + "reward_std": 0.0030401148833334446, + "rewards//mean": 0.75347900390625, + "rewards//std": 0.0257528368383646, + "step": 1819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.364, + "grad_norm": 1.770247459411621, + "kl": 1.1378947533667088, + "learning_rate": 7.16615061908217e-07, + "loss": 0.1138, + "num_tokens": 15730736.0, + "reward": 0.739501953125, + "reward_std": 0.005134006962180138, + "rewards//mean": 0.739501953125, + "rewards//std": 0.04060612618923187, + "step": 1820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3642, + "grad_norm": 1.3821980953216553, + "kl": 0.6415708940476179, + "learning_rate": 7.163290117232541e-07, + "loss": 0.0642, + "num_tokens": 15739304.0, + "reward": 0.76324462890625, + "reward_std": 0.0005179004510864615, + "rewards//mean": 0.76324462890625, + "rewards//std": 0.021277552470564842, + "step": 1821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3644, + "grad_norm": 19.419260025024414, + "kl": 1.0268391743302345, + "learning_rate": 7.160428744009912e-07, + "loss": 0.1027, + "num_tokens": 15747912.0, + "reward": 0.7359619140625, + "reward_std": 0.008618786931037903, + "rewards//mean": 0.7359619140625, + "rewards//std": 0.026609499007463455, + "step": 1822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3646, + "grad_norm": 3.62929368019104, + "kl": 2.081236759200692, + "learning_rate": 7.157566500566842e-07, + "loss": 0.2081, + "num_tokens": 15756560.0, + "reward": 0.78173828125, + "reward_std": 0.01659890078008175, + "rewards//mean": 0.78173828125, + "rewards//std": 0.028961164876818657, + "step": 1823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3648, + "grad_norm": 4.564011573791504, + "kl": 1.8572072703391314, + "learning_rate": 7.154703388056244e-07, + "loss": 0.1857, + "num_tokens": 15765136.0, + "reward": 0.7550048828125, + "reward_std": 0.009905409999191761, + "rewards//mean": 0.7550048828125, + "rewards//std": 0.028412101790308952, + "step": 1824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.365, + "grad_norm": 2.2224252223968506, + "kl": 1.4804861135780811, + "learning_rate": 7.15183940763138e-07, + "loss": 0.148, + "num_tokens": 15773752.0, + "reward": 0.76629638671875, + "reward_std": 0.010908014141023159, + "rewards//mean": 0.76629638671875, + "rewards//std": 0.03380085900425911, + "step": 1825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3652, + "grad_norm": 5.550025463104248, + "kl": 1.1781852692365646, + "learning_rate": 7.148974560445858e-07, + "loss": 0.1178, + "num_tokens": 15782448.0, + "reward": 0.80841064453125, + "reward_std": 0.010814267210662365, + "rewards//mean": 0.80841064453125, + "rewards//std": 0.027869535610079765, + "step": 1826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3654, + "grad_norm": 5.150213241577148, + "kl": 2.069683153182268, + "learning_rate": 7.146108847653641e-07, + "loss": 0.207, + "num_tokens": 15791048.0, + "reward": 0.74420166015625, + "reward_std": 0.011490346863865852, + "rewards//mean": 0.74420166015625, + "rewards//std": 0.02910708449780941, + "step": 1827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3656, + "grad_norm": 3.5626261234283447, + "kl": 1.5172289572656155, + "learning_rate": 7.143242270409037e-07, + "loss": 0.1517, + "num_tokens": 15799808.0, + "reward": 0.74676513671875, + "reward_std": 0.012836494483053684, + "rewards//mean": 0.74676513671875, + "rewards//std": 0.04204024001955986, + "step": 1828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3658, + "grad_norm": 2.1888387203216553, + "kl": 0.7403810862451792, + "learning_rate": 7.140374829866702e-07, + "loss": 0.074, + "num_tokens": 15808392.0, + "reward": 0.73809814453125, + "reward_std": 0.005442460998892784, + "rewards//mean": 0.73809814453125, + "rewards//std": 0.022309036925435066, + "step": 1829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.366, + "grad_norm": 1.5039739608764648, + "kl": 1.6122042424976826, + "learning_rate": 7.137506527181643e-07, + "loss": 0.1612, + "num_tokens": 15817048.0, + "reward": 0.751220703125, + "reward_std": 0.006968109868466854, + "rewards//mean": 0.751220703125, + "rewards//std": 0.032710738480091095, + "step": 1830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3662, + "grad_norm": 4.752030372619629, + "kl": 1.8018863126635551, + "learning_rate": 7.134637363509209e-07, + "loss": 0.1802, + "num_tokens": 15825664.0, + "reward": 0.75128173828125, + "reward_std": 0.008976714685559273, + "rewards//mean": 0.75128173828125, + "rewards//std": 0.027796657755970955, + "step": 1831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3664, + "grad_norm": 2.9698989391326904, + "kl": 1.3148400988429785, + "learning_rate": 7.131767340005101e-07, + "loss": 0.1315, + "num_tokens": 15834296.0, + "reward": 0.72882080078125, + "reward_std": 0.007888147607445717, + "rewards//mean": 0.72882080078125, + "rewards//std": 0.02841496467590332, + "step": 1832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3666, + "grad_norm": 2.261000633239746, + "kl": 1.148534793406725, + "learning_rate": 7.128896457825363e-07, + "loss": 0.1149, + "num_tokens": 15842992.0, + "reward": 0.77691650390625, + "reward_std": 0.011105703189969063, + "rewards//mean": 0.77691650390625, + "rewards//std": 0.03385500609874725, + "step": 1833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3668, + "grad_norm": 10.06489086151123, + "kl": 1.1229076366871595, + "learning_rate": 7.126024718126387e-07, + "loss": 0.1123, + "num_tokens": 15851704.0, + "reward": 0.75543212890625, + "reward_std": 0.0066683851182460785, + "rewards//mean": 0.75543212890625, + "rewards//std": 0.026654046028852463, + "step": 1834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.367, + "grad_norm": 2.543853759765625, + "kl": 0.6991872619837523, + "learning_rate": 7.123152122064908e-07, + "loss": 0.0699, + "num_tokens": 15860272.0, + "reward": 0.77618408203125, + "reward_std": 0.0065511795692145824, + "rewards//mean": 0.77618408203125, + "rewards//std": 0.024136804044246674, + "step": 1835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3672, + "grad_norm": 3.3016326427459717, + "kl": 1.433420417830348, + "learning_rate": 7.120278670798009e-07, + "loss": 0.1433, + "num_tokens": 15868880.0, + "reward": 0.76513671875, + "reward_std": 0.014103761874139309, + "rewards//mean": 0.76513671875, + "rewards//std": 0.03648821637034416, + "step": 1836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3674, + "grad_norm": 15.681102752685547, + "kl": 2.7154130339622498, + "learning_rate": 7.117404365483115e-07, + "loss": 0.2715, + "num_tokens": 15877640.0, + "reward": 0.7822265625, + "reward_std": 0.010400941595435143, + "rewards//mean": 0.7822265625, + "rewards//std": 0.03711019083857536, + "step": 1837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3676, + "grad_norm": 2.0528388023376465, + "kl": 0.8739600479602814, + "learning_rate": 7.114529207277995e-07, + "loss": 0.0874, + "num_tokens": 15886312.0, + "reward": 0.75274658203125, + "reward_std": 0.004914752207696438, + "rewards//mean": 0.75274658203125, + "rewards//std": 0.025052646175026894, + "step": 1838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3678, + "grad_norm": 0.9327261447906494, + "kl": 0.6090308651328087, + "learning_rate": 7.111653197340764e-07, + "loss": 0.0609, + "num_tokens": 15894920.0, + "reward": 0.73419189453125, + "reward_std": 0.002529338002204895, + "rewards//mean": 0.73419189453125, + "rewards//std": 0.026267895475029945, + "step": 1839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.368, + "grad_norm": 2.3686490058898926, + "kl": 0.9842582866549492, + "learning_rate": 7.108776336829876e-07, + "loss": 0.0984, + "num_tokens": 15903472.0, + "reward": 0.77490234375, + "reward_std": 0.00263409037142992, + "rewards//mean": 0.77490234375, + "rewards//std": 0.01799515075981617, + "step": 1840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3682, + "grad_norm": 16.48367691040039, + "kl": 1.4165671542286873, + "learning_rate": 7.105898626904134e-07, + "loss": 0.1417, + "num_tokens": 15912104.0, + "reward": 0.79345703125, + "reward_std": 0.010839138180017471, + "rewards//mean": 0.79345703125, + "rewards//std": 0.022146357223391533, + "step": 1841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3684, + "grad_norm": 19.49656105041504, + "kl": 1.8708918560296297, + "learning_rate": 7.103020068722674e-07, + "loss": 0.1871, + "num_tokens": 15920808.0, + "reward": 0.7288818359375, + "reward_std": 0.011415512301027775, + "rewards//mean": 0.7288818359375, + "rewards//std": 0.03764619305729866, + "step": 1842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3686, + "grad_norm": 2.1908531188964844, + "kl": 2.2299514431506395, + "learning_rate": 7.100140663444984e-07, + "loss": 0.223, + "num_tokens": 15929496.0, + "reward": 0.74114990234375, + "reward_std": 0.01153961569070816, + "rewards//mean": 0.74114990234375, + "rewards//std": 0.0386400930583477, + "step": 1843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3688, + "grad_norm": 1.8744713068008423, + "kl": 1.1681729834526777, + "learning_rate": 7.097260412230885e-07, + "loss": 0.1168, + "num_tokens": 15938136.0, + "reward": 0.755126953125, + "reward_std": 0.00805087760090828, + "rewards//mean": 0.755126953125, + "rewards//std": 0.02592865191400051, + "step": 1844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.369, + "grad_norm": 3.0080108642578125, + "kl": 1.4773118402808905, + "learning_rate": 7.094379316240544e-07, + "loss": 0.1477, + "num_tokens": 15946768.0, + "reward": 0.73944091796875, + "reward_std": 0.005808740388602018, + "rewards//mean": 0.73944091796875, + "rewards//std": 0.02850751020014286, + "step": 1845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3692, + "grad_norm": 3.1908085346221924, + "kl": 1.072961589321494, + "learning_rate": 7.091497376634463e-07, + "loss": 0.1073, + "num_tokens": 15955440.0, + "reward": 0.7869873046875, + "reward_std": 0.007595873903483152, + "rewards//mean": 0.7869873046875, + "rewards//std": 0.028713131323456764, + "step": 1846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3694, + "grad_norm": 2.3255116939544678, + "kl": 1.105245677754283, + "learning_rate": 7.088614594573491e-07, + "loss": 0.1105, + "num_tokens": 15964048.0, + "reward": 0.76324462890625, + "reward_std": 0.008902833797037601, + "rewards//mean": 0.76324462890625, + "rewards//std": 0.034115131944417953, + "step": 1847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3696, + "grad_norm": 2.5090808868408203, + "kl": 1.824024187400937, + "learning_rate": 7.085730971218809e-07, + "loss": 0.1824, + "num_tokens": 15972664.0, + "reward": 0.7740478515625, + "reward_std": 0.01558766234666109, + "rewards//mean": 0.7740478515625, + "rewards//std": 0.025725562125444412, + "step": 1848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3698, + "grad_norm": 1.9827549457550049, + "kl": 1.789289928972721, + "learning_rate": 7.082846507731941e-07, + "loss": 0.1789, + "num_tokens": 15981304.0, + "reward": 0.73907470703125, + "reward_std": 0.012560350820422173, + "rewards//mean": 0.73907470703125, + "rewards//std": 0.031850166618824005, + "step": 1849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.37, + "grad_norm": 20.470462799072266, + "kl": 2.968334635719657, + "learning_rate": 7.079961205274748e-07, + "loss": 0.2968, + "num_tokens": 15989976.0, + "reward": 0.78729248046875, + "reward_std": 0.020587624981999397, + "rewards//mean": 0.78729248046875, + "rewards//std": 0.035908978432416916, + "step": 1850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3702, + "grad_norm": 2.3084218502044678, + "kl": 1.7427243553102016, + "learning_rate": 7.077075065009433e-07, + "loss": 0.1743, + "num_tokens": 15998608.0, + "reward": 0.788330078125, + "reward_std": 0.009603803977370262, + "rewards//mean": 0.788330078125, + "rewards//std": 0.035131268203258514, + "step": 1851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3704, + "grad_norm": 6.136078834533691, + "kl": 1.7758771125227213, + "learning_rate": 7.074188088098527e-07, + "loss": 0.1776, + "num_tokens": 16007184.0, + "reward": 0.7642822265625, + "reward_std": 0.009786337614059448, + "rewards//mean": 0.7642822265625, + "rewards//std": 0.03215320408344269, + "step": 1852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3706, + "grad_norm": 1.8461768627166748, + "kl": 1.7065285798162222, + "learning_rate": 7.071300275704909e-07, + "loss": 0.1707, + "num_tokens": 16015936.0, + "reward": 0.7442626953125, + "reward_std": 0.009451361373066902, + "rewards//mean": 0.7442626953125, + "rewards//std": 0.025094076991081238, + "step": 1853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3708, + "grad_norm": 8.691837310791016, + "kl": 1.6497365441173315, + "learning_rate": 7.068411628991787e-07, + "loss": 0.165, + "num_tokens": 16024664.0, + "reward": 0.76043701171875, + "reward_std": 0.0033445670269429684, + "rewards//mean": 0.76043701171875, + "rewards//std": 0.037383656948804855, + "step": 1854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.371, + "grad_norm": 1.724465250968933, + "kl": 0.8312657419592142, + "learning_rate": 7.065522149122709e-07, + "loss": 0.0831, + "num_tokens": 16033264.0, + "reward": 0.731689453125, + "reward_std": 0.005383252166211605, + "rewards//mean": 0.731689453125, + "rewards//std": 0.029967118054628372, + "step": 1855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3712, + "grad_norm": 1.1823596954345703, + "kl": 0.9029164835810661, + "learning_rate": 7.062631837261556e-07, + "loss": 0.0903, + "num_tokens": 16041776.0, + "reward": 0.79107666015625, + "reward_std": 0.005362005904316902, + "rewards//mean": 0.79107666015625, + "rewards//std": 0.021327294409275055, + "step": 1856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3714, + "grad_norm": 0.8451175093650818, + "kl": 1.0803232304751873, + "learning_rate": 7.059740694572545e-07, + "loss": 0.108, + "num_tokens": 16050400.0, + "reward": 0.744140625, + "reward_std": 0.003940070513635874, + "rewards//mean": 0.744140625, + "rewards//std": 0.020564204081892967, + "step": 1857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3716, + "grad_norm": 3.1342833042144775, + "kl": 2.3297573048621416, + "learning_rate": 7.056848722220228e-07, + "loss": 0.233, + "num_tokens": 16059080.0, + "reward": 0.7744140625, + "reward_std": 0.013841914013028145, + "rewards//mean": 0.7744140625, + "rewards//std": 0.029048843309283257, + "step": 1858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3718, + "grad_norm": 1.7096662521362305, + "kl": 0.774777602404356, + "learning_rate": 7.053955921369493e-07, + "loss": 0.0775, + "num_tokens": 16067680.0, + "reward": 0.7767333984375, + "reward_std": 0.0027954974211752415, + "rewards//mean": 0.7767333984375, + "rewards//std": 0.018897438421845436, + "step": 1859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.372, + "grad_norm": 5.2674994468688965, + "kl": 1.6537601090967655, + "learning_rate": 7.051062293185559e-07, + "loss": 0.1654, + "num_tokens": 16076320.0, + "reward": 0.7572021484375, + "reward_std": 0.014470890164375305, + "rewards//mean": 0.7572021484375, + "rewards//std": 0.036306675523519516, + "step": 1860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3722, + "grad_norm": 2.3366708755493164, + "kl": 1.874405587092042, + "learning_rate": 7.048167838833976e-07, + "loss": 0.1874, + "num_tokens": 16084880.0, + "reward": 0.74871826171875, + "reward_std": 0.00786502193659544, + "rewards//mean": 0.74871826171875, + "rewards//std": 0.027604296803474426, + "step": 1861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3724, + "grad_norm": 4.2101545333862305, + "kl": 1.4288507923483849, + "learning_rate": 7.045272559480635e-07, + "loss": 0.1429, + "num_tokens": 16093472.0, + "reward": 0.743896484375, + "reward_std": 0.01189388521015644, + "rewards//mean": 0.743896484375, + "rewards//std": 0.02821025624871254, + "step": 1862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3726, + "grad_norm": 6.675228118896484, + "kl": 2.042024416849017, + "learning_rate": 7.042376456291751e-07, + "loss": 0.2042, + "num_tokens": 16102144.0, + "reward": 0.7470703125, + "reward_std": 0.008256803266704082, + "rewards//mean": 0.7470703125, + "rewards//std": 0.0385444313287735, + "step": 1863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3728, + "grad_norm": 2.311246871948242, + "kl": 1.6616391614079475, + "learning_rate": 7.039479530433874e-07, + "loss": 0.1662, + "num_tokens": 16110744.0, + "reward": 0.7674560546875, + "reward_std": 0.009082126431167126, + "rewards//mean": 0.7674560546875, + "rewards//std": 0.02785683609545231, + "step": 1864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.373, + "grad_norm": 2.56750226020813, + "kl": 1.7200164832174778, + "learning_rate": 7.036581783073887e-07, + "loss": 0.172, + "num_tokens": 16119360.0, + "reward": 0.73394775390625, + "reward_std": 0.010016953572630882, + "rewards//mean": 0.73394775390625, + "rewards//std": 0.033682867884635925, + "step": 1865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3732, + "grad_norm": 9.25973129272461, + "kl": 1.2967899721115828, + "learning_rate": 7.033683215379002e-07, + "loss": 0.1297, + "num_tokens": 16128112.0, + "reward": 0.7705078125, + "reward_std": 0.007930691353976727, + "rewards//mean": 0.7705078125, + "rewards//std": 0.030480990186333656, + "step": 1866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3734, + "grad_norm": 5.735073089599609, + "kl": 1.5535210128873587, + "learning_rate": 7.030783828516759e-07, + "loss": 0.1554, + "num_tokens": 16136808.0, + "reward": 0.7532958984375, + "reward_std": 0.007747113239020109, + "rewards//mean": 0.7532958984375, + "rewards//std": 0.027270298451185226, + "step": 1867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3736, + "grad_norm": 5.924224376678467, + "kl": 2.225294578820467, + "learning_rate": 7.027883623655034e-07, + "loss": 0.2225, + "num_tokens": 16145360.0, + "reward": 0.76019287109375, + "reward_std": 0.011357331648468971, + "rewards//mean": 0.76019287109375, + "rewards//std": 0.02892867475748062, + "step": 1868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3738, + "grad_norm": 7.374082565307617, + "kl": 1.942330228164792, + "learning_rate": 7.024982601962026e-07, + "loss": 0.1942, + "num_tokens": 16154056.0, + "reward": 0.75714111328125, + "reward_std": 0.005384392105042934, + "rewards//mean": 0.75714111328125, + "rewards//std": 0.023994645103812218, + "step": 1869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.374, + "grad_norm": 3.31447696685791, + "kl": 1.5992409456521273, + "learning_rate": 7.022080764606271e-07, + "loss": 0.1599, + "num_tokens": 16162616.0, + "reward": 0.73419189453125, + "reward_std": 0.012074579484760761, + "rewards//mean": 0.73419189453125, + "rewards//std": 0.03620119392871857, + "step": 1870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3742, + "grad_norm": 3.434015989303589, + "kl": 2.0065502747893333, + "learning_rate": 7.019178112756625e-07, + "loss": 0.2007, + "num_tokens": 16171304.0, + "reward": 0.7576904296875, + "reward_std": 0.012057574465870857, + "rewards//mean": 0.7576904296875, + "rewards//std": 0.03374645859003067, + "step": 1871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3744, + "grad_norm": 1.9910322427749634, + "kl": 0.966529119759798, + "learning_rate": 7.016274647582276e-07, + "loss": 0.0967, + "num_tokens": 16179984.0, + "reward": 0.78094482421875, + "reward_std": 0.003195766592398286, + "rewards//mean": 0.78094482421875, + "rewards//std": 0.02564150094985962, + "step": 1872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3746, + "grad_norm": 3.2058041095733643, + "kl": 1.9518991596996784, + "learning_rate": 7.013370370252739e-07, + "loss": 0.1952, + "num_tokens": 16188520.0, + "reward": 0.77301025390625, + "reward_std": 0.00646575540304184, + "rewards//mean": 0.77301025390625, + "rewards//std": 0.01578104868531227, + "step": 1873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3748, + "grad_norm": 11.18813705444336, + "kl": 1.291033249348402, + "learning_rate": 7.010465281937858e-07, + "loss": 0.1291, + "num_tokens": 16197088.0, + "reward": 0.78656005859375, + "reward_std": 0.007830414921045303, + "rewards//mean": 0.78656005859375, + "rewards//std": 0.019861631095409393, + "step": 1874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.375, + "grad_norm": 47.68659973144531, + "kl": 3.9973225481808186, + "learning_rate": 7.007559383807802e-07, + "loss": 0.3997, + "num_tokens": 16205768.0, + "reward": 0.74432373046875, + "reward_std": 0.013239345513284206, + "rewards//mean": 0.74432373046875, + "rewards//std": 0.03863813355565071, + "step": 1875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3752, + "grad_norm": 22.866840362548828, + "kl": 1.9784103631973267, + "learning_rate": 7.004652677033068e-07, + "loss": 0.1978, + "num_tokens": 16214376.0, + "reward": 0.760009765625, + "reward_std": 0.014567185193300247, + "rewards//mean": 0.760009765625, + "rewards//std": 0.025561751797795296, + "step": 1876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3754, + "grad_norm": 13.356947898864746, + "kl": 2.0567911826074123, + "learning_rate": 7.001745162784475e-07, + "loss": 0.2057, + "num_tokens": 16222952.0, + "reward": 0.7266845703125, + "reward_std": 0.004197565373033285, + "rewards//mean": 0.7266845703125, + "rewards//std": 0.021451586857438087, + "step": 1877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3756, + "grad_norm": 8.387414932250977, + "kl": 1.653434380888939, + "learning_rate": 6.998836842233169e-07, + "loss": 0.1653, + "num_tokens": 16231624.0, + "reward": 0.77984619140625, + "reward_std": 0.008104367181658745, + "rewards//mean": 0.77984619140625, + "rewards//std": 0.03065030463039875, + "step": 1878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3758, + "grad_norm": 1.5858031511306763, + "kl": 0.946024801582098, + "learning_rate": 6.995927716550622e-07, + "loss": 0.0946, + "num_tokens": 16240272.0, + "reward": 0.7742919921875, + "reward_std": 0.007378405425697565, + "rewards//mean": 0.7742919921875, + "rewards//std": 0.023882482200860977, + "step": 1879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.376, + "grad_norm": 2.5536181926727295, + "kl": 1.0827032681554556, + "learning_rate": 6.99301778690863e-07, + "loss": 0.1083, + "num_tokens": 16248920.0, + "reward": 0.7535400390625, + "reward_std": 0.007021937519311905, + "rewards//mean": 0.7535400390625, + "rewards//std": 0.030062692239880562, + "step": 1880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3762, + "grad_norm": 13.058320045471191, + "kl": 1.2034966554492712, + "learning_rate": 6.990107054479312e-07, + "loss": 0.1203, + "num_tokens": 16257576.0, + "reward": 0.763427734375, + "reward_std": 0.005399170331656933, + "rewards//mean": 0.763427734375, + "rewards//std": 0.03184785321354866, + "step": 1881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3764, + "grad_norm": 0.7369622588157654, + "kl": 0.800319992005825, + "learning_rate": 6.987195520435109e-07, + "loss": 0.08, + "num_tokens": 16266248.0, + "reward": 0.73876953125, + "reward_std": 0.0027755668852478266, + "rewards//mean": 0.73876953125, + "rewards//std": 0.02507023885846138, + "step": 1882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3766, + "grad_norm": 1.3183344602584839, + "kl": 0.9885077476501465, + "learning_rate": 6.984283185948789e-07, + "loss": 0.0989, + "num_tokens": 16274880.0, + "reward": 0.76861572265625, + "reward_std": 0.006070063915103674, + "rewards//mean": 0.76861572265625, + "rewards//std": 0.03312912955880165, + "step": 1883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3768, + "grad_norm": 12.471165657043457, + "kl": 1.6292383763939142, + "learning_rate": 6.981370052193439e-07, + "loss": 0.1629, + "num_tokens": 16283720.0, + "reward": 0.77667236328125, + "reward_std": 0.009694737382233143, + "rewards//mean": 0.77667236328125, + "rewards//std": 0.031262047588825226, + "step": 1884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.377, + "grad_norm": 2.3145735263824463, + "kl": 1.2088876217603683, + "learning_rate": 6.978456120342469e-07, + "loss": 0.1209, + "num_tokens": 16292424.0, + "reward": 0.783447265625, + "reward_std": 0.008539421483874321, + "rewards//mean": 0.783447265625, + "rewards//std": 0.02722279727458954, + "step": 1885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3772, + "grad_norm": 12.633748054504395, + "kl": 1.3388360384851694, + "learning_rate": 6.975541391569609e-07, + "loss": 0.1339, + "num_tokens": 16301016.0, + "reward": 0.75921630859375, + "reward_std": 0.012317837215960026, + "rewards//mean": 0.75921630859375, + "rewards//std": 0.03363429754972458, + "step": 1886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3774, + "grad_norm": 7.928569793701172, + "kl": 1.8477709367871284, + "learning_rate": 6.972625867048914e-07, + "loss": 0.1848, + "num_tokens": 16309584.0, + "reward": 0.755859375, + "reward_std": 0.009783722460269928, + "rewards//mean": 0.755859375, + "rewards//std": 0.035495635122060776, + "step": 1887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3776, + "grad_norm": 7.905439376831055, + "kl": 1.8555166963487864, + "learning_rate": 6.969709547954755e-07, + "loss": 0.1856, + "num_tokens": 16318192.0, + "reward": 0.77197265625, + "reward_std": 0.009541463106870651, + "rewards//mean": 0.77197265625, + "rewards//std": 0.03386489674448967, + "step": 1888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3778, + "grad_norm": 1.6442062854766846, + "kl": 0.5479441192001104, + "learning_rate": 6.966792435461826e-07, + "loss": 0.0548, + "num_tokens": 16326832.0, + "reward": 0.7723388671875, + "reward_std": 0.0031074027065187693, + "rewards//mean": 0.7723388671875, + "rewards//std": 0.023077895864844322, + "step": 1889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.378, + "grad_norm": 7.154458999633789, + "kl": 1.1153011620044708, + "learning_rate": 6.963874530745139e-07, + "loss": 0.1115, + "num_tokens": 16335400.0, + "reward": 0.740966796875, + "reward_std": 0.009196332655847073, + "rewards//mean": 0.740966796875, + "rewards//std": 0.02685553953051567, + "step": 1890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3782, + "grad_norm": 2.1894285678863525, + "kl": 1.2471293229609728, + "learning_rate": 6.960955834980027e-07, + "loss": 0.1247, + "num_tokens": 16344080.0, + "reward": 0.74151611328125, + "reward_std": 0.006976440083235502, + "rewards//mean": 0.74151611328125, + "rewards//std": 0.02658979222178459, + "step": 1891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3784, + "grad_norm": 12.276518821716309, + "kl": 3.3270397186279297, + "learning_rate": 6.958036349342139e-07, + "loss": 0.3327, + "num_tokens": 16352640.0, + "reward": 0.7413330078125, + "reward_std": 0.016672108322381973, + "rewards//mean": 0.7413330078125, + "rewards//std": 0.03904059901833534, + "step": 1892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3786, + "grad_norm": 2.6406450271606445, + "kl": 0.9585860967636108, + "learning_rate": 6.955116075007442e-07, + "loss": 0.0959, + "num_tokens": 16361280.0, + "reward": 0.76934814453125, + "reward_std": 0.009336546994745731, + "rewards//mean": 0.76934814453125, + "rewards//std": 0.02935926616191864, + "step": 1893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3788, + "grad_norm": 7.74597692489624, + "kl": 1.2267329227179289, + "learning_rate": 6.952195013152225e-07, + "loss": 0.1227, + "num_tokens": 16369872.0, + "reward": 0.764892578125, + "reward_std": 0.008651645854115486, + "rewards//mean": 0.764892578125, + "rewards//std": 0.025590162724256516, + "step": 1894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.379, + "grad_norm": 1.802874207496643, + "kl": 0.8312840610742569, + "learning_rate": 6.94927316495309e-07, + "loss": 0.0831, + "num_tokens": 16378512.0, + "reward": 0.77117919921875, + "reward_std": 0.006767125800251961, + "rewards//mean": 0.77117919921875, + "rewards//std": 0.019172104075551033, + "step": 1895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3792, + "grad_norm": 2.32957124710083, + "kl": 1.7574926782399416, + "learning_rate": 6.946350531586957e-07, + "loss": 0.1757, + "num_tokens": 16387160.0, + "reward": 0.75518798828125, + "reward_std": 0.011470545083284378, + "rewards//mean": 0.75518798828125, + "rewards//std": 0.03320443630218506, + "step": 1896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3794, + "grad_norm": 5.658195495605469, + "kl": 2.0864341594278812, + "learning_rate": 6.943427114231063e-07, + "loss": 0.2086, + "num_tokens": 16395784.0, + "reward": 0.76605224609375, + "reward_std": 0.008342149667441845, + "rewards//mean": 0.76605224609375, + "rewards//std": 0.015595788136124611, + "step": 1897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3796, + "grad_norm": 1.0706034898757935, + "kl": 0.6285277456045151, + "learning_rate": 6.94050291406296e-07, + "loss": 0.0629, + "num_tokens": 16404432.0, + "reward": 0.7647705078125, + "reward_std": 0.0038165440782904625, + "rewards//mean": 0.7647705078125, + "rewards//std": 0.025774944573640823, + "step": 1898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3798, + "grad_norm": 2.2915685176849365, + "kl": 1.2932017892599106, + "learning_rate": 6.937577932260514e-07, + "loss": 0.1293, + "num_tokens": 16413160.0, + "reward": 0.76568603515625, + "reward_std": 0.00904356874525547, + "rewards//mean": 0.76568603515625, + "rewards//std": 0.02796551026403904, + "step": 1899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.38, + "grad_norm": 4.388787269592285, + "kl": 2.3041618540883064, + "learning_rate": 6.93465217000191e-07, + "loss": 0.2304, + "num_tokens": 16421768.0, + "reward": 0.73626708984375, + "reward_std": 0.017494281753897667, + "rewards//mean": 0.73626708984375, + "rewards//std": 0.04452718421816826, + "step": 1900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3802, + "grad_norm": 2.6694602966308594, + "kl": 0.9762528147548437, + "learning_rate": 6.931725628465642e-07, + "loss": 0.0976, + "num_tokens": 16430400.0, + "reward": 0.74591064453125, + "reward_std": 0.0068107424303889275, + "rewards//mean": 0.74591064453125, + "rewards//std": 0.026350749656558037, + "step": 1901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3804, + "grad_norm": 1.481976866722107, + "kl": 1.0082335472106934, + "learning_rate": 6.928798308830523e-07, + "loss": 0.1008, + "num_tokens": 16439048.0, + "reward": 0.7430419921875, + "reward_std": 0.007673492655158043, + "rewards//mean": 0.7430419921875, + "rewards//std": 0.03300994262099266, + "step": 1902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3806, + "grad_norm": 2.5784013271331787, + "kl": 1.025176851078868, + "learning_rate": 6.925870212275676e-07, + "loss": 0.1025, + "num_tokens": 16447656.0, + "reward": 0.72705078125, + "reward_std": 0.010677337646484375, + "rewards//mean": 0.72705078125, + "rewards//std": 0.03812427446246147, + "step": 1903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3808, + "grad_norm": 2.5363693237304688, + "kl": 1.1179410461336374, + "learning_rate": 6.922941339980537e-07, + "loss": 0.1118, + "num_tokens": 16456272.0, + "reward": 0.76568603515625, + "reward_std": 0.005083904135972261, + "rewards//mean": 0.76568603515625, + "rewards//std": 0.0326782688498497, + "step": 1904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.381, + "grad_norm": 4.336783409118652, + "kl": 1.889224173501134, + "learning_rate": 6.920011693124856e-07, + "loss": 0.1889, + "num_tokens": 16464952.0, + "reward": 0.797119140625, + "reward_std": 0.016552994027733803, + "rewards//mean": 0.797119140625, + "rewards//std": 0.03982317075133324, + "step": 1905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3812, + "grad_norm": 2.7505311965942383, + "kl": 1.2263763677328825, + "learning_rate": 6.917081272888696e-07, + "loss": 0.1226, + "num_tokens": 16473592.0, + "reward": 0.78375244140625, + "reward_std": 0.011583936400711536, + "rewards//mean": 0.78375244140625, + "rewards//std": 0.030941301956772804, + "step": 1906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3814, + "grad_norm": 3.5298731327056885, + "kl": 0.8228543344885111, + "learning_rate": 6.914150080452428e-07, + "loss": 0.0823, + "num_tokens": 16482232.0, + "reward": 0.7437744140625, + "reward_std": 0.004534014966338873, + "rewards//mean": 0.7437744140625, + "rewards//std": 0.0359480045735836, + "step": 1907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3816, + "grad_norm": 1.8533520698547363, + "kl": 0.9186171144247055, + "learning_rate": 6.911218116996736e-07, + "loss": 0.0919, + "num_tokens": 16490856.0, + "reward": 0.76287841796875, + "reward_std": 0.005424214527010918, + "rewards//mean": 0.76287841796875, + "rewards//std": 0.020167943090200424, + "step": 1908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3818, + "grad_norm": 4.732398986816406, + "kl": 2.199810292571783, + "learning_rate": 6.908285383702616e-07, + "loss": 0.22, + "num_tokens": 16499488.0, + "reward": 0.767333984375, + "reward_std": 0.016987569630146027, + "rewards//mean": 0.767333984375, + "rewards//std": 0.0346662774682045, + "step": 1909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.382, + "grad_norm": 0.935878336429596, + "kl": 0.6690200604498386, + "learning_rate": 6.905351881751371e-07, + "loss": 0.0669, + "num_tokens": 16508064.0, + "reward": 0.76055908203125, + "reward_std": 0.0030061921570450068, + "rewards//mean": 0.76055908203125, + "rewards//std": 0.03105313703417778, + "step": 1910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3822, + "grad_norm": 3.0205533504486084, + "kl": 0.7351449299603701, + "learning_rate": 6.902417612324615e-07, + "loss": 0.0735, + "num_tokens": 16516688.0, + "reward": 0.71868896484375, + "reward_std": 0.002594362013041973, + "rewards//mean": 0.71868896484375, + "rewards//std": 0.026543639600276947, + "step": 1911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3824, + "grad_norm": 2.0582287311553955, + "kl": 0.8999587371945381, + "learning_rate": 6.899482576604274e-07, + "loss": 0.09, + "num_tokens": 16525384.0, + "reward": 0.71905517578125, + "reward_std": 0.005210091359913349, + "rewards//mean": 0.71905517578125, + "rewards//std": 0.02900966815650463, + "step": 1912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3826, + "grad_norm": 3.1458547115325928, + "kl": 1.3339712284505367, + "learning_rate": 6.896546775772576e-07, + "loss": 0.1334, + "num_tokens": 16533976.0, + "reward": 0.75115966796875, + "reward_std": 0.005288252606987953, + "rewards//mean": 0.75115966796875, + "rewards//std": 0.029344825074076653, + "step": 1913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3828, + "grad_norm": 5.858672618865967, + "kl": 1.3377245664596558, + "learning_rate": 6.893610211012066e-07, + "loss": 0.1338, + "num_tokens": 16542632.0, + "reward": 0.74371337890625, + "reward_std": 0.005274464376270771, + "rewards//mean": 0.74371337890625, + "rewards//std": 0.024075891822576523, + "step": 1914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.383, + "grad_norm": 1.4700431823730469, + "kl": 0.9460238479077816, + "learning_rate": 6.890672883505588e-07, + "loss": 0.0946, + "num_tokens": 16551320.0, + "reward": 0.80389404296875, + "reward_std": 0.00816527009010315, + "rewards//mean": 0.80389404296875, + "rewards//std": 0.03156755119562149, + "step": 1915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3832, + "grad_norm": 5.436630725860596, + "kl": 1.1437436919659376, + "learning_rate": 6.887734794436299e-07, + "loss": 0.1144, + "num_tokens": 16559944.0, + "reward": 0.79931640625, + "reward_std": 0.013120634481310844, + "rewards//mean": 0.79931640625, + "rewards//std": 0.02296249009668827, + "step": 1916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3834, + "grad_norm": 7.216220378875732, + "kl": 2.6739409286528826, + "learning_rate": 6.884795944987661e-07, + "loss": 0.2674, + "num_tokens": 16568616.0, + "reward": 0.73291015625, + "reward_std": 0.016908852383494377, + "rewards//mean": 0.73291015625, + "rewards//std": 0.04493063688278198, + "step": 1917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3836, + "grad_norm": 2.0758872032165527, + "kl": 1.1840754691511393, + "learning_rate": 6.881856336343441e-07, + "loss": 0.1184, + "num_tokens": 16577184.0, + "reward": 0.77850341796875, + "reward_std": 0.010706901550292969, + "rewards//mean": 0.77850341796875, + "rewards//std": 0.024266904219985008, + "step": 1918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3838, + "grad_norm": 2.5120158195495605, + "kl": 1.9357288107275963, + "learning_rate": 6.878915969687714e-07, + "loss": 0.1936, + "num_tokens": 16585896.0, + "reward": 0.75335693359375, + "reward_std": 0.011784134432673454, + "rewards//mean": 0.75335693359375, + "rewards//std": 0.025637367740273476, + "step": 1919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.384, + "grad_norm": 4.904962062835693, + "kl": 1.3946696668863297, + "learning_rate": 6.875974846204858e-07, + "loss": 0.1395, + "num_tokens": 16594656.0, + "reward": 0.76751708984375, + "reward_std": 0.011228116229176521, + "rewards//mean": 0.76751708984375, + "rewards//std": 0.028981998562812805, + "step": 1920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3842, + "grad_norm": 2.127612590789795, + "kl": 1.453100511804223, + "learning_rate": 6.87303296707956e-07, + "loss": 0.1453, + "num_tokens": 16603224.0, + "reward": 0.78070068359375, + "reward_std": 0.010628015734255314, + "rewards//mean": 0.78070068359375, + "rewards//std": 0.03292470797896385, + "step": 1921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3844, + "grad_norm": 1.864574909210205, + "kl": 1.8873520381748676, + "learning_rate": 6.870090333496806e-07, + "loss": 0.1887, + "num_tokens": 16611944.0, + "reward": 0.75738525390625, + "reward_std": 0.010470920242369175, + "rewards//mean": 0.75738525390625, + "rewards//std": 0.01807059533894062, + "step": 1922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3846, + "grad_norm": 3.190722703933716, + "kl": 1.9334096498787403, + "learning_rate": 6.867146946641891e-07, + "loss": 0.1933, + "num_tokens": 16620656.0, + "reward": 0.755859375, + "reward_std": 0.014163737185299397, + "rewards//mean": 0.755859375, + "rewards//std": 0.025610268115997314, + "step": 1923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3848, + "grad_norm": 4.621662139892578, + "kl": 0.9856888316571712, + "learning_rate": 6.864202807700407e-07, + "loss": 0.0986, + "num_tokens": 16629280.0, + "reward": 0.775634765625, + "reward_std": 0.0029898949433118105, + "rewards//mean": 0.775634765625, + "rewards//std": 0.022051827982068062, + "step": 1924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.385, + "grad_norm": 2.4850211143493652, + "kl": 2.518883015960455, + "learning_rate": 6.861257917858257e-07, + "loss": 0.2519, + "num_tokens": 16637840.0, + "reward": 0.75048828125, + "reward_std": 0.018755264580249786, + "rewards//mean": 0.75048828125, + "rewards//std": 0.03805432841181755, + "step": 1925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3852, + "grad_norm": 3.111382484436035, + "kl": 1.5108758825808764, + "learning_rate": 6.858312278301637e-07, + "loss": 0.1511, + "num_tokens": 16646456.0, + "reward": 0.758544921875, + "reward_std": 0.007399954367429018, + "rewards//mean": 0.758544921875, + "rewards//std": 0.028398511931300163, + "step": 1926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3854, + "grad_norm": 1.1277971267700195, + "kl": 1.0972880274057388, + "learning_rate": 6.855365890217056e-07, + "loss": 0.1097, + "num_tokens": 16655072.0, + "reward": 0.7708740234375, + "reward_std": 0.007282741833478212, + "rewards//mean": 0.7708740234375, + "rewards//std": 0.028791053220629692, + "step": 1927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3856, + "grad_norm": 2.733692169189453, + "kl": 1.738442301750183, + "learning_rate": 6.852418754791316e-07, + "loss": 0.1738, + "num_tokens": 16663784.0, + "reward": 0.82855224609375, + "reward_std": 0.009084641002118587, + "rewards//mean": 0.82855224609375, + "rewards//std": 0.020506612956523895, + "step": 1928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3858, + "grad_norm": 8.57189655303955, + "kl": 2.6197225730866194, + "learning_rate": 6.849470873211522e-07, + "loss": 0.262, + "num_tokens": 16672448.0, + "reward": 0.72967529296875, + "reward_std": 0.01400065142661333, + "rewards//mean": 0.72967529296875, + "rewards//std": 0.04469108581542969, + "step": 1929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.386, + "grad_norm": 2.0354530811309814, + "kl": 0.7951034177094698, + "learning_rate": 6.846522246665083e-07, + "loss": 0.0795, + "num_tokens": 16681040.0, + "reward": 0.7459716796875, + "reward_std": 0.005416989326477051, + "rewards//mean": 0.7459716796875, + "rewards//std": 0.022423164919018745, + "step": 1930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3862, + "grad_norm": 1.7759015560150146, + "kl": 1.3610915672034025, + "learning_rate": 6.843572876339704e-07, + "loss": 0.1361, + "num_tokens": 16689648.0, + "reward": 0.75347900390625, + "reward_std": 0.00892785657197237, + "rewards//mean": 0.75347900390625, + "rewards//std": 0.02415122464299202, + "step": 1931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3864, + "grad_norm": 2.468689203262329, + "kl": 0.974500609561801, + "learning_rate": 6.840622763423391e-07, + "loss": 0.0975, + "num_tokens": 16698256.0, + "reward": 0.74560546875, + "reward_std": 0.007358761504292488, + "rewards//mean": 0.74560546875, + "rewards//std": 0.031616006046533585, + "step": 1932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3866, + "grad_norm": 1.4950695037841797, + "kl": 1.2004152946174145, + "learning_rate": 6.837671909104447e-07, + "loss": 0.12, + "num_tokens": 16706920.0, + "reward": 0.750732421875, + "reward_std": 0.0077482243068516254, + "rewards//mean": 0.750732421875, + "rewards//std": 0.03942586109042168, + "step": 1933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3868, + "grad_norm": 10.273947715759277, + "kl": 2.033162873238325, + "learning_rate": 6.834720314571479e-07, + "loss": 0.2033, + "num_tokens": 16715640.0, + "reward": 0.7603759765625, + "reward_std": 0.008452069014310837, + "rewards//mean": 0.7603759765625, + "rewards//std": 0.03224346786737442, + "step": 1934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.387, + "grad_norm": 3.9013893604278564, + "kl": 1.0506266802549362, + "learning_rate": 6.831767981013388e-07, + "loss": 0.1051, + "num_tokens": 16724336.0, + "reward": 0.74005126953125, + "reward_std": 0.0036983320023864508, + "rewards//mean": 0.74005126953125, + "rewards//std": 0.025901123881340027, + "step": 1935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3872, + "grad_norm": 8.675947189331055, + "kl": 1.2521464210003614, + "learning_rate": 6.828814909619372e-07, + "loss": 0.1252, + "num_tokens": 16733000.0, + "reward": 0.7713623046875, + "reward_std": 0.0072280410677194595, + "rewards//mean": 0.7713623046875, + "rewards//std": 0.02669356018304825, + "step": 1936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3874, + "grad_norm": 4.412496089935303, + "kl": 1.6320644859224558, + "learning_rate": 6.82586110157893e-07, + "loss": 0.1632, + "num_tokens": 16741664.0, + "reward": 0.73004150390625, + "reward_std": 0.011390022933483124, + "rewards//mean": 0.73004150390625, + "rewards//std": 0.03067202866077423, + "step": 1937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3876, + "grad_norm": 3.2514443397521973, + "kl": 1.6396849621087313, + "learning_rate": 6.822906558081856e-07, + "loss": 0.164, + "num_tokens": 16750360.0, + "reward": 0.7725830078125, + "reward_std": 0.01013142429292202, + "rewards//mean": 0.7725830078125, + "rewards//std": 0.029243865981698036, + "step": 1938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3878, + "grad_norm": 2.7781484127044678, + "kl": 1.7112006973475218, + "learning_rate": 6.819951280318236e-07, + "loss": 0.1711, + "num_tokens": 16758984.0, + "reward": 0.7593994140625, + "reward_std": 0.012383747845888138, + "rewards//mean": 0.7593994140625, + "rewards//std": 0.032827842980623245, + "step": 1939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.388, + "grad_norm": 1.5462831258773804, + "kl": 0.8432313669472933, + "learning_rate": 6.816995269478459e-07, + "loss": 0.0843, + "num_tokens": 16767672.0, + "reward": 0.76971435546875, + "reward_std": 0.00866425596177578, + "rewards//mean": 0.76971435546875, + "rewards//std": 0.030357016250491142, + "step": 1940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3882, + "grad_norm": 2.730386257171631, + "kl": 1.5600259955972433, + "learning_rate": 6.814038526753204e-07, + "loss": 0.156, + "num_tokens": 16776352.0, + "reward": 0.7388916015625, + "reward_std": 0.009221317246556282, + "rewards//mean": 0.7388916015625, + "rewards//std": 0.034787241369485855, + "step": 1941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3884, + "grad_norm": 8.104711532592773, + "kl": 1.7729483786970377, + "learning_rate": 6.811081053333449e-07, + "loss": 0.1773, + "num_tokens": 16784968.0, + "reward": 0.72039794921875, + "reward_std": 0.00957479514181614, + "rewards//mean": 0.72039794921875, + "rewards//std": 0.04459682106971741, + "step": 1942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3886, + "grad_norm": 2.123380422592163, + "kl": 1.1190021578222513, + "learning_rate": 6.80812285041046e-07, + "loss": 0.1119, + "num_tokens": 16793592.0, + "reward": 0.781494140625, + "reward_std": 0.0060582393780350685, + "rewards//mean": 0.781494140625, + "rewards//std": 0.017851572483778, + "step": 1943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3888, + "grad_norm": 1.211398959159851, + "kl": 1.0127136074006557, + "learning_rate": 6.805163919175806e-07, + "loss": 0.1013, + "num_tokens": 16802216.0, + "reward": 0.7286376953125, + "reward_std": 0.006353127770125866, + "rewards//mean": 0.7286376953125, + "rewards//std": 0.03037726692855358, + "step": 1944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.389, + "grad_norm": 9.318678855895996, + "kl": 0.8566135875880718, + "learning_rate": 6.80220426082134e-07, + "loss": 0.0857, + "num_tokens": 16810928.0, + "reward": 0.77288818359375, + "reward_std": 0.007881369441747665, + "rewards//mean": 0.77288818359375, + "rewards//std": 0.020432662218809128, + "step": 1945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3892, + "grad_norm": 2.9845526218414307, + "kl": 1.3708241041749716, + "learning_rate": 6.799243876539213e-07, + "loss": 0.1371, + "num_tokens": 16819560.0, + "reward": 0.73297119140625, + "reward_std": 0.010097953490912914, + "rewards//mean": 0.73297119140625, + "rewards//std": 0.03179832175374031, + "step": 1946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3894, + "grad_norm": 3.9203383922576904, + "kl": 0.6361210681498051, + "learning_rate": 6.796282767521869e-07, + "loss": 0.0636, + "num_tokens": 16828184.0, + "reward": 0.740966796875, + "reward_std": 0.001035800902172923, + "rewards//mean": 0.740966796875, + "rewards//std": 0.026354841887950897, + "step": 1947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3896, + "grad_norm": 3.9711384773254395, + "kl": 2.2377002704888582, + "learning_rate": 6.793320934962038e-07, + "loss": 0.2238, + "num_tokens": 16836816.0, + "reward": 0.7701416015625, + "reward_std": 0.015632741153240204, + "rewards//mean": 0.7701416015625, + "rewards//std": 0.03237839788198471, + "step": 1948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3898, + "grad_norm": 4.52419900894165, + "kl": 2.323673529550433, + "learning_rate": 6.790358380052751e-07, + "loss": 0.2324, + "num_tokens": 16845448.0, + "reward": 0.71630859375, + "reward_std": 0.014104368165135384, + "rewards//mean": 0.71630859375, + "rewards//std": 0.03141619265079498, + "step": 1949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.39, + "grad_norm": 2.7977373600006104, + "kl": 1.09340226277709, + "learning_rate": 6.787395103987322e-07, + "loss": 0.1093, + "num_tokens": 16854072.0, + "reward": 0.72857666015625, + "reward_std": 0.005173890385776758, + "rewards//mean": 0.72857666015625, + "rewards//std": 0.023083388805389404, + "step": 1950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3902, + "grad_norm": 5.4792633056640625, + "kl": 2.648605877533555, + "learning_rate": 6.784431107959358e-07, + "loss": 0.2649, + "num_tokens": 16862752.0, + "reward": 0.75103759765625, + "reward_std": 0.015550365671515465, + "rewards//mean": 0.75103759765625, + "rewards//std": 0.04376595467329025, + "step": 1951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3904, + "grad_norm": 1.7156985998153687, + "kl": 1.6892276592552662, + "learning_rate": 6.781466393162761e-07, + "loss": 0.1689, + "num_tokens": 16871304.0, + "reward": 0.7227783203125, + "reward_std": 0.008895116858184338, + "rewards//mean": 0.7227783203125, + "rewards//std": 0.04620348662137985, + "step": 1952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3906, + "grad_norm": 4.42630672454834, + "kl": 2.740915870293975, + "learning_rate": 6.778500960791708e-07, + "loss": 0.2741, + "num_tokens": 16879952.0, + "reward": 0.74871826171875, + "reward_std": 0.016002818942070007, + "rewards//mean": 0.74871826171875, + "rewards//std": 0.04002445936203003, + "step": 1953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3908, + "grad_norm": 4.884814739227295, + "kl": 1.6524646487087011, + "learning_rate": 6.775534812040686e-07, + "loss": 0.1652, + "num_tokens": 16888640.0, + "reward": 0.762451171875, + "reward_std": 0.006168714724481106, + "rewards//mean": 0.762451171875, + "rewards//std": 0.024058358743786812, + "step": 1954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.391, + "grad_norm": 5.532837867736816, + "kl": 1.9937932193279266, + "learning_rate": 6.772567948104452e-07, + "loss": 0.1994, + "num_tokens": 16897224.0, + "reward": 0.75537109375, + "reward_std": 0.021407444030046463, + "rewards//mean": 0.75537109375, + "rewards//std": 0.03148550167679787, + "step": 1955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3912, + "grad_norm": 9.207106590270996, + "kl": 1.4597187489271164, + "learning_rate": 6.769600370178059e-07, + "loss": 0.146, + "num_tokens": 16905904.0, + "reward": 0.74700927734375, + "reward_std": 0.007965510711073875, + "rewards//mean": 0.74700927734375, + "rewards//std": 0.034030720591545105, + "step": 1956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3914, + "grad_norm": 4.234192848205566, + "kl": 1.5983269568532705, + "learning_rate": 6.766632079456851e-07, + "loss": 0.1598, + "num_tokens": 16914576.0, + "reward": 0.75140380859375, + "reward_std": 0.015184727497398853, + "rewards//mean": 0.75140380859375, + "rewards//std": 0.03192754462361336, + "step": 1957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3916, + "grad_norm": 2.5036401748657227, + "kl": 1.0536338668316603, + "learning_rate": 6.76366307713645e-07, + "loss": 0.1054, + "num_tokens": 16923160.0, + "reward": 0.71319580078125, + "reward_std": 0.005778812803328037, + "rewards//mean": 0.71319580078125, + "rewards//std": 0.03547382354736328, + "step": 1958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3918, + "grad_norm": 5.060679912567139, + "kl": 1.984919572249055, + "learning_rate": 6.760693364412775e-07, + "loss": 0.1985, + "num_tokens": 16931832.0, + "reward": 0.77227783203125, + "reward_std": 0.011234838515520096, + "rewards//mean": 0.77227783203125, + "rewards//std": 0.03165566176176071, + "step": 1959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.392, + "grad_norm": 9.679033279418945, + "kl": 0.9287089873105288, + "learning_rate": 6.757722942482022e-07, + "loss": 0.0929, + "num_tokens": 16940536.0, + "reward": 0.7427978515625, + "reward_std": 0.0045805806294083595, + "rewards//mean": 0.7427978515625, + "rewards//std": 0.02992946282029152, + "step": 1960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3922, + "grad_norm": 6.345630645751953, + "kl": 2.2394301649183035, + "learning_rate": 6.754751812540679e-07, + "loss": 0.2239, + "num_tokens": 16949200.0, + "reward": 0.7667236328125, + "reward_std": 0.013151612132787704, + "rewards//mean": 0.7667236328125, + "rewards//std": 0.033323951065540314, + "step": 1961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3924, + "grad_norm": 6.910009860992432, + "kl": 2.583277940750122, + "learning_rate": 6.751779975785514e-07, + "loss": 0.2583, + "num_tokens": 16957848.0, + "reward": 0.741455078125, + "reward_std": 0.014633771032094955, + "rewards//mean": 0.741455078125, + "rewards//std": 0.04051057994365692, + "step": 1962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3926, + "grad_norm": 7.566949367523193, + "kl": 1.6424854304641485, + "learning_rate": 6.748807433413586e-07, + "loss": 0.1642, + "num_tokens": 16966464.0, + "reward": 0.78973388671875, + "reward_std": 0.01055777445435524, + "rewards//mean": 0.78973388671875, + "rewards//std": 0.029020102694630623, + "step": 1963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3928, + "grad_norm": 15.303129196166992, + "kl": 3.1461045145988464, + "learning_rate": 6.745834186622231e-07, + "loss": 0.3146, + "num_tokens": 16975304.0, + "reward": 0.73602294921875, + "reward_std": 0.013409584760665894, + "rewards//mean": 0.73602294921875, + "rewards//std": 0.03864988312125206, + "step": 1964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.393, + "grad_norm": 5.361306190490723, + "kl": 1.765540674328804, + "learning_rate": 6.742860236609076e-07, + "loss": 0.1766, + "num_tokens": 16983960.0, + "reward": 0.739990234375, + "reward_std": 0.014041764661669731, + "rewards//mean": 0.739990234375, + "rewards//std": 0.03307156264781952, + "step": 1965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3932, + "grad_norm": 4.571110725402832, + "kl": 1.7866956647485495, + "learning_rate": 6.739885584572025e-07, + "loss": 0.1787, + "num_tokens": 16992616.0, + "reward": 0.7611083984375, + "reward_std": 0.009677095338702202, + "rewards//mean": 0.7611083984375, + "rewards//std": 0.02764080837368965, + "step": 1966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3934, + "grad_norm": 9.737143516540527, + "kl": 1.4153119344264269, + "learning_rate": 6.73691023170927e-07, + "loss": 0.1415, + "num_tokens": 17001224.0, + "reward": 0.74267578125, + "reward_std": 0.009522315114736557, + "rewards//mean": 0.74267578125, + "rewards//std": 0.044169504195451736, + "step": 1967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3936, + "grad_norm": 8.495573997497559, + "kl": 1.613790376111865, + "learning_rate": 6.733934179219281e-07, + "loss": 0.1614, + "num_tokens": 17009864.0, + "reward": 0.76177978515625, + "reward_std": 0.013794094324111938, + "rewards//mean": 0.76177978515625, + "rewards//std": 0.027067698538303375, + "step": 1968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3938, + "grad_norm": 3.696143388748169, + "kl": 1.8864859715104103, + "learning_rate": 6.730957428300811e-07, + "loss": 0.1886, + "num_tokens": 17018376.0, + "reward": 0.7576904296875, + "reward_std": 0.014782344922423363, + "rewards//mean": 0.7576904296875, + "rewards//std": 0.03793619945645332, + "step": 1969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.394, + "grad_norm": 3.1203043460845947, + "kl": 0.6857241913676262, + "learning_rate": 6.727979980152898e-07, + "loss": 0.0686, + "num_tokens": 17026976.0, + "reward": 0.77105712890625, + "reward_std": 0.005191301926970482, + "rewards//mean": 0.77105712890625, + "rewards//std": 0.02950071543455124, + "step": 1970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3942, + "grad_norm": 2.8411827087402344, + "kl": 0.9823378846049309, + "learning_rate": 6.725001835974852e-07, + "loss": 0.0982, + "num_tokens": 17035568.0, + "reward": 0.76788330078125, + "reward_std": 0.005136963911354542, + "rewards//mean": 0.76788330078125, + "rewards//std": 0.025159969925880432, + "step": 1971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3944, + "grad_norm": 3.5230236053466797, + "kl": 0.899571318179369, + "learning_rate": 6.722022996966277e-07, + "loss": 0.09, + "num_tokens": 17044176.0, + "reward": 0.7330322265625, + "reward_std": 0.003262493060901761, + "rewards//mean": 0.7330322265625, + "rewards//std": 0.02540343999862671, + "step": 1972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3946, + "grad_norm": 5.399919033050537, + "kl": 1.9057795070111752, + "learning_rate": 6.719043464327042e-07, + "loss": 0.1906, + "num_tokens": 17052752.0, + "reward": 0.75933837890625, + "reward_std": 0.01613207533955574, + "rewards//mean": 0.75933837890625, + "rewards//std": 0.03235892951488495, + "step": 1973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3948, + "grad_norm": 5.980194568634033, + "kl": 1.6304620802402496, + "learning_rate": 6.716063239257306e-07, + "loss": 0.163, + "num_tokens": 17061416.0, + "reward": 0.75360107421875, + "reward_std": 0.010458397679030895, + "rewards//mean": 0.75360107421875, + "rewards//std": 0.040333397686481476, + "step": 1974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.395, + "grad_norm": 8.40462589263916, + "kl": 2.120277812704444, + "learning_rate": 6.713082322957502e-07, + "loss": 0.212, + "num_tokens": 17070008.0, + "reward": 0.7569580078125, + "reward_std": 0.01008752454072237, + "rewards//mean": 0.7569580078125, + "rewards//std": 0.028717348352074623, + "step": 1975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3952, + "grad_norm": 17.99199867248535, + "kl": 2.275618724524975, + "learning_rate": 6.710100716628344e-07, + "loss": 0.2276, + "num_tokens": 17078680.0, + "reward": 0.75274658203125, + "reward_std": 0.013096587732434273, + "rewards//mean": 0.75274658203125, + "rewards//std": 0.03835975006222725, + "step": 1976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3954, + "grad_norm": 10.808684349060059, + "kl": 2.4570872634649277, + "learning_rate": 6.70711842147082e-07, + "loss": 0.2457, + "num_tokens": 17087232.0, + "reward": 0.74688720703125, + "reward_std": 0.013904260471463203, + "rewards//mean": 0.74688720703125, + "rewards//std": 0.03641964867711067, + "step": 1977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3956, + "grad_norm": 12.57481861114502, + "kl": 2.9545410871505737, + "learning_rate": 6.704135438686203e-07, + "loss": 0.2955, + "num_tokens": 17095856.0, + "reward": 0.76068115234375, + "reward_std": 0.011217588558793068, + "rewards//mean": 0.76068115234375, + "rewards//std": 0.03128286078572273, + "step": 1978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3958, + "grad_norm": 16.341278076171875, + "kl": 2.0866538248956203, + "learning_rate": 6.701151769476032e-07, + "loss": 0.2087, + "num_tokens": 17104456.0, + "reward": 0.75799560546875, + "reward_std": 0.009943559765815735, + "rewards//mean": 0.75799560546875, + "rewards//std": 0.028943846002221107, + "step": 1979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.396, + "grad_norm": 20.670608520507812, + "kl": 2.6539546567946672, + "learning_rate": 6.698167415042134e-07, + "loss": 0.2654, + "num_tokens": 17113160.0, + "reward": 0.72332763671875, + "reward_std": 0.01024525985121727, + "rewards//mean": 0.72332763671875, + "rewards//std": 0.034114688634872437, + "step": 1980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3962, + "grad_norm": 26.590112686157227, + "kl": 3.4797907043248415, + "learning_rate": 6.695182376586602e-07, + "loss": 0.348, + "num_tokens": 17121936.0, + "reward": 0.74481201171875, + "reward_std": 0.020049354061484337, + "rewards//mean": 0.74481201171875, + "rewards//std": 0.044971343129873276, + "step": 1981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3964, + "grad_norm": 5.345390796661377, + "kl": 1.8074094392359257, + "learning_rate": 6.692196655311814e-07, + "loss": 0.1807, + "num_tokens": 17130496.0, + "reward": 0.7347412109375, + "reward_std": 0.014416481368243694, + "rewards//mean": 0.7347412109375, + "rewards//std": 0.0392586775124073, + "step": 1982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3966, + "grad_norm": 19.076135635375977, + "kl": 2.9417112972587347, + "learning_rate": 6.689210252420415e-07, + "loss": 0.2942, + "num_tokens": 17139192.0, + "reward": 0.7327880859375, + "reward_std": 0.021368755027651787, + "rewards//mean": 0.7327880859375, + "rewards//std": 0.05111980438232422, + "step": 1983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3968, + "grad_norm": 21.99047088623047, + "kl": 3.586339859291911, + "learning_rate": 6.686223169115327e-07, + "loss": 0.3586, + "num_tokens": 17147984.0, + "reward": 0.7293701171875, + "reward_std": 0.010650159791111946, + "rewards//mean": 0.7293701171875, + "rewards//std": 0.03817486763000488, + "step": 1984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.397, + "grad_norm": 7.656482219696045, + "kl": 0.8482975848019123, + "learning_rate": 6.683235406599749e-07, + "loss": 0.0848, + "num_tokens": 17156640.0, + "reward": 0.7391357421875, + "reward_std": 0.005536393262445927, + "rewards//mean": 0.7391357421875, + "rewards//std": 0.035973262041807175, + "step": 1985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3972, + "grad_norm": 8.579667091369629, + "kl": 2.4071076661348343, + "learning_rate": 6.68024696607715e-07, + "loss": 0.2407, + "num_tokens": 17165376.0, + "reward": 0.76629638671875, + "reward_std": 0.0118489945307374, + "rewards//mean": 0.76629638671875, + "rewards//std": 0.02634270489215851, + "step": 1986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3974, + "grad_norm": 3.7696523666381836, + "kl": 1.5843081548810005, + "learning_rate": 6.677257848751276e-07, + "loss": 0.1584, + "num_tokens": 17174008.0, + "reward": 0.78240966796875, + "reward_std": 0.009303020313382149, + "rewards//mean": 0.78240966796875, + "rewards//std": 0.024741342291235924, + "step": 1987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3976, + "grad_norm": 2.2588491439819336, + "kl": 0.9959504008293152, + "learning_rate": 6.674268055826138e-07, + "loss": 0.0996, + "num_tokens": 17182720.0, + "reward": 0.76434326171875, + "reward_std": 0.006129647605121136, + "rewards//mean": 0.76434326171875, + "rewards//std": 0.02718878537416458, + "step": 1988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3978, + "grad_norm": 5.689353942871094, + "kl": 0.9194694440811872, + "learning_rate": 6.671277588506029e-07, + "loss": 0.0919, + "num_tokens": 17191400.0, + "reward": 0.7371826171875, + "reward_std": 0.0032536799553781748, + "rewards//mean": 0.7371826171875, + "rewards//std": 0.03268550708889961, + "step": 1989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.398, + "grad_norm": 2.835458993911743, + "kl": 0.6638965383172035, + "learning_rate": 6.668286447995507e-07, + "loss": 0.0664, + "num_tokens": 17200032.0, + "reward": 0.748779296875, + "reward_std": 0.0030498532578349113, + "rewards//mean": 0.748779296875, + "rewards//std": 0.025514332577586174, + "step": 1990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3982, + "grad_norm": 4.356527328491211, + "kl": 1.531398318707943, + "learning_rate": 6.665294635499403e-07, + "loss": 0.1531, + "num_tokens": 17208680.0, + "reward": 0.77423095703125, + "reward_std": 0.01328960806131363, + "rewards//mean": 0.77423095703125, + "rewards//std": 0.027447011321783066, + "step": 1991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3984, + "grad_norm": 6.774599075317383, + "kl": 0.9724132753908634, + "learning_rate": 6.66230215222282e-07, + "loss": 0.0972, + "num_tokens": 17217296.0, + "reward": 0.76702880859375, + "reward_std": 0.009563660249114037, + "rewards//mean": 0.76702880859375, + "rewards//std": 0.03426919877529144, + "step": 1992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3986, + "grad_norm": 2.9007930755615234, + "kl": 1.5415233988314867, + "learning_rate": 6.659308999371129e-07, + "loss": 0.1542, + "num_tokens": 17225960.0, + "reward": 0.7188720703125, + "reward_std": 0.012013616971671581, + "rewards//mean": 0.7188720703125, + "rewards//std": 0.03355933353304863, + "step": 1993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3988, + "grad_norm": 3.7563085556030273, + "kl": 1.1901140250265598, + "learning_rate": 6.65631517814997e-07, + "loss": 0.119, + "num_tokens": 17234600.0, + "reward": 0.74310302734375, + "reward_std": 0.008727531880140305, + "rewards//mean": 0.74310302734375, + "rewards//std": 0.031702011823654175, + "step": 1994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.399, + "grad_norm": 2.4339611530303955, + "kl": 1.5453631542623043, + "learning_rate": 6.653320689765256e-07, + "loss": 0.1545, + "num_tokens": 17243224.0, + "reward": 0.76031494140625, + "reward_std": 0.01022228505462408, + "rewards//mean": 0.76031494140625, + "rewards//std": 0.026100818067789078, + "step": 1995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3992, + "grad_norm": 3.2661845684051514, + "kl": 1.124980976805091, + "learning_rate": 6.650325535423166e-07, + "loss": 0.1125, + "num_tokens": 17251840.0, + "reward": 0.7269287109375, + "reward_std": 0.007605080492794514, + "rewards//mean": 0.7269287109375, + "rewards//std": 0.027539854869246483, + "step": 1996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3994, + "grad_norm": 2.4316375255584717, + "kl": 0.7848870139569044, + "learning_rate": 6.647329716330147e-07, + "loss": 0.0785, + "num_tokens": 17260456.0, + "reward": 0.73541259765625, + "reward_std": 0.007167475763708353, + "rewards//mean": 0.73541259765625, + "rewards//std": 0.0317058339715004, + "step": 1997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3996, + "grad_norm": 3.2776546478271484, + "kl": 1.6006934456527233, + "learning_rate": 6.644333233692916e-07, + "loss": 0.1601, + "num_tokens": 17269256.0, + "reward": 0.7718505859375, + "reward_std": 0.013405588455498219, + "rewards//mean": 0.7718505859375, + "rewards//std": 0.029312115162611008, + "step": 1998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.3998, + "grad_norm": 2.0470504760742188, + "kl": 0.7851455882191658, + "learning_rate": 6.641336088718456e-07, + "loss": 0.0785, + "num_tokens": 17277904.0, + "reward": 0.7802734375, + "reward_std": 0.007900599390268326, + "rewards//mean": 0.7802734375, + "rewards//std": 0.02856115810573101, + "step": 1999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4, + "grad_norm": 2.0166735649108887, + "kl": 1.69225930608809, + "learning_rate": 6.638338282614014e-07, + "loss": 0.1692, + "num_tokens": 17286544.0, + "reward": 0.71392822265625, + "reward_std": 0.01248107198625803, + "rewards//mean": 0.71392822265625, + "rewards//std": 0.03881208226084709, + "step": 2000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4002, + "grad_norm": 1.9074249267578125, + "kl": 1.2381714042276144, + "learning_rate": 6.635339816587108e-07, + "loss": 0.1238, + "num_tokens": 17295104.0, + "reward": 0.75152587890625, + "reward_std": 0.007324153557419777, + "rewards//mean": 0.75152587890625, + "rewards//std": 0.034412022680044174, + "step": 2001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4004, + "grad_norm": 3.627312660217285, + "kl": 1.4870726894587278, + "learning_rate": 6.632340691845519e-07, + "loss": 0.1487, + "num_tokens": 17303720.0, + "reward": 0.74755859375, + "reward_std": 0.00930589996278286, + "rewards//mean": 0.74755859375, + "rewards//std": 0.03138533979654312, + "step": 2002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4006, + "grad_norm": 1.96901273727417, + "kl": 0.5314656235277653, + "learning_rate": 6.629340909597297e-07, + "loss": 0.0531, + "num_tokens": 17312384.0, + "reward": 0.77191162109375, + "reward_std": 0.002374867908656597, + "rewards//mean": 0.77191162109375, + "rewards//std": 0.015400438569486141, + "step": 2003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4008, + "grad_norm": 6.3772053718566895, + "kl": 2.1691827047616243, + "learning_rate": 6.626340471050748e-07, + "loss": 0.2169, + "num_tokens": 17321056.0, + "reward": 0.7401123046875, + "reward_std": 0.01127574872225523, + "rewards//mean": 0.7401123046875, + "rewards//std": 0.041304513812065125, + "step": 2004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.401, + "grad_norm": 3.0683350563049316, + "kl": 0.9349662065505981, + "learning_rate": 6.623339377414455e-07, + "loss": 0.0935, + "num_tokens": 17329616.0, + "reward": 0.7767333984375, + "reward_std": 0.006449653767049313, + "rewards//mean": 0.7767333984375, + "rewards//std": 0.02956306003034115, + "step": 2005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4012, + "grad_norm": 2.7204205989837646, + "kl": 0.8533254358917475, + "learning_rate": 6.620337629897252e-07, + "loss": 0.0853, + "num_tokens": 17338240.0, + "reward": 0.75372314453125, + "reward_std": 0.005696904845535755, + "rewards//mean": 0.75372314453125, + "rewards//std": 0.0260456632822752, + "step": 2006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4014, + "grad_norm": 3.8325419425964355, + "kl": 1.0298786275088787, + "learning_rate": 6.617335229708248e-07, + "loss": 0.103, + "num_tokens": 17346936.0, + "reward": 0.77374267578125, + "reward_std": 0.0034789713099598885, + "rewards//mean": 0.77374267578125, + "rewards//std": 0.022394370287656784, + "step": 2007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4016, + "grad_norm": 6.680671691894531, + "kl": 1.0986030958592892, + "learning_rate": 6.614332178056805e-07, + "loss": 0.1099, + "num_tokens": 17355520.0, + "reward": 0.74755859375, + "reward_std": 0.004781857132911682, + "rewards//mean": 0.74755859375, + "rewards//std": 0.03362805023789406, + "step": 2008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4018, + "grad_norm": 4.605096817016602, + "kl": 1.0114766340702772, + "learning_rate": 6.611328476152556e-07, + "loss": 0.1011, + "num_tokens": 17364200.0, + "reward": 0.75799560546875, + "reward_std": 0.006291838828474283, + "rewards//mean": 0.75799560546875, + "rewards//std": 0.023024950176477432, + "step": 2009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.402, + "grad_norm": 2.8837902545928955, + "kl": 1.138092340901494, + "learning_rate": 6.608324125205387e-07, + "loss": 0.1138, + "num_tokens": 17372856.0, + "reward": 0.7261962890625, + "reward_std": 0.010601367801427841, + "rewards//mean": 0.7261962890625, + "rewards//std": 0.02555079385638237, + "step": 2010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4022, + "grad_norm": 2.175081491470337, + "kl": 0.7374224457889795, + "learning_rate": 6.605319126425453e-07, + "loss": 0.0737, + "num_tokens": 17381512.0, + "reward": 0.733154296875, + "reward_std": 0.006387358531355858, + "rewards//mean": 0.733154296875, + "rewards//std": 0.03570568561553955, + "step": 2011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4024, + "grad_norm": 3.8558781147003174, + "kl": 0.9677978456020355, + "learning_rate": 6.60231348102317e-07, + "loss": 0.0968, + "num_tokens": 17390064.0, + "reward": 0.780029296875, + "reward_std": 0.008658995851874352, + "rewards//mean": 0.780029296875, + "rewards//std": 0.01888020895421505, + "step": 2012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4026, + "grad_norm": 8.65006160736084, + "kl": 0.778912840411067, + "learning_rate": 6.599307190209204e-07, + "loss": 0.0779, + "num_tokens": 17398672.0, + "reward": 0.7734375, + "reward_std": 0.00610438734292984, + "rewards//mean": 0.7734375, + "rewards//std": 0.029065513983368874, + "step": 2013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4028, + "grad_norm": 2.8029911518096924, + "kl": 0.7555165309458971, + "learning_rate": 6.596300255194496e-07, + "loss": 0.0756, + "num_tokens": 17407192.0, + "reward": 0.7578125, + "reward_std": 0.005495252087712288, + "rewards//mean": 0.7578125, + "rewards//std": 0.028527216985821724, + "step": 2014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.403, + "grad_norm": 4.159221649169922, + "kl": 1.9968494437634945, + "learning_rate": 6.593292677190235e-07, + "loss": 0.1997, + "num_tokens": 17415832.0, + "reward": 0.7930908203125, + "reward_std": 0.017496876418590546, + "rewards//mean": 0.7930908203125, + "rewards//std": 0.026829317212104797, + "step": 2015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4032, + "grad_norm": 2.3279263973236084, + "kl": 1.0599929634481668, + "learning_rate": 6.590284457407875e-07, + "loss": 0.106, + "num_tokens": 17424520.0, + "reward": 0.763916015625, + "reward_std": 0.007199055049568415, + "rewards//mean": 0.763916015625, + "rewards//std": 0.03292476385831833, + "step": 2016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4034, + "grad_norm": 1.2485874891281128, + "kl": 0.6027565095573664, + "learning_rate": 6.587275597059124e-07, + "loss": 0.0603, + "num_tokens": 17433128.0, + "reward": 0.75775146484375, + "reward_std": 0.0025052325800061226, + "rewards//mean": 0.75775146484375, + "rewards//std": 0.02604624442756176, + "step": 2017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4036, + "grad_norm": 1.3312336206436157, + "kl": 0.6760806702077389, + "learning_rate": 6.584266097355954e-07, + "loss": 0.0676, + "num_tokens": 17441840.0, + "reward": 0.7568359375, + "reward_std": 0.005553403869271278, + "rewards//mean": 0.7568359375, + "rewards//std": 0.02626393362879753, + "step": 2018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4038, + "grad_norm": 1.3900004625320435, + "kl": 0.5901344828307629, + "learning_rate": 6.581255959510588e-07, + "loss": 0.059, + "num_tokens": 17450496.0, + "reward": 0.758544921875, + "reward_std": 0.0037135255988687277, + "rewards//mean": 0.758544921875, + "rewards//std": 0.029910489916801453, + "step": 2019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.404, + "grad_norm": 0.478079229593277, + "kl": 0.4421610552817583, + "learning_rate": 6.578245184735512e-07, + "loss": 0.0442, + "num_tokens": 17459200.0, + "reward": 0.77752685546875, + "reward_std": 0.0007886359235271811, + "rewards//mean": 0.77752685546875, + "rewards//std": 0.02180390991270542, + "step": 2020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4042, + "grad_norm": 1.9101742506027222, + "kl": 1.9482213277369738, + "learning_rate": 6.575233774243464e-07, + "loss": 0.1948, + "num_tokens": 17467824.0, + "reward": 0.7493896484375, + "reward_std": 0.015871524810791016, + "rewards//mean": 0.7493896484375, + "rewards//std": 0.0384955070912838, + "step": 2021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4044, + "grad_norm": 3.749629259109497, + "kl": 0.8403290398418903, + "learning_rate": 6.57222172924744e-07, + "loss": 0.084, + "num_tokens": 17476480.0, + "reward": 0.77191162109375, + "reward_std": 0.004095377400517464, + "rewards//mean": 0.77191162109375, + "rewards//std": 0.026581251993775368, + "step": 2022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4046, + "grad_norm": 5.751692771911621, + "kl": 1.149877868592739, + "learning_rate": 6.569209050960691e-07, + "loss": 0.115, + "num_tokens": 17485112.0, + "reward": 0.77630615234375, + "reward_std": 0.0036429085303097963, + "rewards//mean": 0.77630615234375, + "rewards//std": 0.030002394691109657, + "step": 2023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4048, + "grad_norm": 2.524386167526245, + "kl": 1.0947369653731585, + "learning_rate": 6.566195740596725e-07, + "loss": 0.1095, + "num_tokens": 17493744.0, + "reward": 0.787353515625, + "reward_std": 0.007318058051168919, + "rewards//mean": 0.787353515625, + "rewards//std": 0.01772904209792614, + "step": 2024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.405, + "grad_norm": 3.3059611320495605, + "kl": 1.7416103146970272, + "learning_rate": 6.563181799369301e-07, + "loss": 0.1742, + "num_tokens": 17502384.0, + "reward": 0.75634765625, + "reward_std": 0.01362548116594553, + "rewards//mean": 0.75634765625, + "rewards//std": 0.03023766539990902, + "step": 2025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4052, + "grad_norm": 7.124985694885254, + "kl": 3.0250588040798903, + "learning_rate": 6.560167228492434e-07, + "loss": 0.3025, + "num_tokens": 17511024.0, + "reward": 0.7303466796875, + "reward_std": 0.01609044149518013, + "rewards//mean": 0.7303466796875, + "rewards//std": 0.03682827204465866, + "step": 2026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4054, + "grad_norm": 9.336557388305664, + "kl": 2.6373706106096506, + "learning_rate": 6.557152029180397e-07, + "loss": 0.2637, + "num_tokens": 17519752.0, + "reward": 0.75543212890625, + "reward_std": 0.011776036582887173, + "rewards//mean": 0.75543212890625, + "rewards//std": 0.027068817988038063, + "step": 2027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4056, + "grad_norm": 2.162250518798828, + "kl": 1.503698781132698, + "learning_rate": 6.554136202647706e-07, + "loss": 0.1504, + "num_tokens": 17528472.0, + "reward": 0.76861572265625, + "reward_std": 0.010287073440849781, + "rewards//mean": 0.76861572265625, + "rewards//std": 0.03548705205321312, + "step": 2028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4058, + "grad_norm": 4.032214641571045, + "kl": 2.242517886683345, + "learning_rate": 6.551119750109141e-07, + "loss": 0.2243, + "num_tokens": 17537072.0, + "reward": 0.75390625, + "reward_std": 0.017234783619642258, + "rewards//mean": 0.75390625, + "rewards//std": 0.03577430173754692, + "step": 2029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.406, + "grad_norm": 2.4072046279907227, + "kl": 1.6082334611564875, + "learning_rate": 6.548102672779724e-07, + "loss": 0.1608, + "num_tokens": 17545680.0, + "reward": 0.77557373046875, + "reward_std": 0.011320850811898708, + "rewards//mean": 0.77557373046875, + "rewards//std": 0.03452664613723755, + "step": 2030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4062, + "grad_norm": 7.844313144683838, + "kl": 2.640589749440551, + "learning_rate": 6.545084971874736e-07, + "loss": 0.2641, + "num_tokens": 17554368.0, + "reward": 0.7647705078125, + "reward_std": 0.00841673742979765, + "rewards//mean": 0.7647705078125, + "rewards//std": 0.030165238305926323, + "step": 2031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4064, + "grad_norm": 1.7076689004898071, + "kl": 1.6775240413844585, + "learning_rate": 6.542066648609707e-07, + "loss": 0.1678, + "num_tokens": 17563080.0, + "reward": 0.76727294921875, + "reward_std": 0.009338431060314178, + "rewards//mean": 0.76727294921875, + "rewards//std": 0.033253636211156845, + "step": 2032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4066, + "grad_norm": 3.1234536170959473, + "kl": 1.291312212124467, + "learning_rate": 6.539047704200417e-07, + "loss": 0.1291, + "num_tokens": 17571640.0, + "reward": 0.759033203125, + "reward_std": 0.006047561764717102, + "rewards//mean": 0.759033203125, + "rewards//std": 0.02012219838798046, + "step": 2033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4068, + "grad_norm": 1.9934250116348267, + "kl": 2.064103761687875, + "learning_rate": 6.536028139862893e-07, + "loss": 0.2064, + "num_tokens": 17580272.0, + "reward": 0.75537109375, + "reward_std": 0.018664442002773285, + "rewards//mean": 0.75537109375, + "rewards//std": 0.03693027421832085, + "step": 2034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.407, + "grad_norm": 2.274355173110962, + "kl": 1.78364316560328, + "learning_rate": 6.53300795681342e-07, + "loss": 0.1784, + "num_tokens": 17588904.0, + "reward": 0.74078369140625, + "reward_std": 0.011885635554790497, + "rewards//mean": 0.74078369140625, + "rewards//std": 0.029954928904771805, + "step": 2035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4072, + "grad_norm": 3.210240364074707, + "kl": 2.0895239897072315, + "learning_rate": 6.529987156268526e-07, + "loss": 0.209, + "num_tokens": 17597544.0, + "reward": 0.76617431640625, + "reward_std": 0.012959875166416168, + "rewards//mean": 0.76617431640625, + "rewards//std": 0.03596247732639313, + "step": 2036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4074, + "grad_norm": 9.994877815246582, + "kl": 1.90946414321661, + "learning_rate": 6.526965739444988e-07, + "loss": 0.1909, + "num_tokens": 17606280.0, + "reward": 0.736572265625, + "reward_std": 0.0067734369076788425, + "rewards//mean": 0.736572265625, + "rewards//std": 0.02389673888683319, + "step": 2037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4076, + "grad_norm": 4.8204345703125, + "kl": 1.5475333109498024, + "learning_rate": 6.523943707559832e-07, + "loss": 0.1548, + "num_tokens": 17614984.0, + "reward": 0.74163818359375, + "reward_std": 0.005846993066370487, + "rewards//mean": 0.74163818359375, + "rewards//std": 0.04733194783329964, + "step": 2038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4078, + "grad_norm": 2.098745822906494, + "kl": 1.0994709637016058, + "learning_rate": 6.520921061830333e-07, + "loss": 0.1099, + "num_tokens": 17623688.0, + "reward": 0.7578125, + "reward_std": 0.00951296091079712, + "rewards//mean": 0.7578125, + "rewards//std": 0.03535205125808716, + "step": 2039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.408, + "grad_norm": 2.9249560832977295, + "kl": 1.3069669771939516, + "learning_rate": 6.517897803474011e-07, + "loss": 0.1307, + "num_tokens": 17632304.0, + "reward": 0.77850341796875, + "reward_std": 0.008851700462400913, + "rewards//mean": 0.77850341796875, + "rewards//std": 0.024926647543907166, + "step": 2040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4082, + "grad_norm": 2.5026135444641113, + "kl": 1.3848806973546743, + "learning_rate": 6.514873933708637e-07, + "loss": 0.1385, + "num_tokens": 17640920.0, + "reward": 0.74365234375, + "reward_std": 0.009737861342728138, + "rewards//mean": 0.74365234375, + "rewards//std": 0.029144570231437683, + "step": 2041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4084, + "grad_norm": 4.660781383514404, + "kl": 1.7242193538695574, + "learning_rate": 6.511849453752223e-07, + "loss": 0.1724, + "num_tokens": 17649568.0, + "reward": 0.7921142578125, + "reward_std": 0.013369088061153889, + "rewards//mean": 0.7921142578125, + "rewards//std": 0.02901938557624817, + "step": 2042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4086, + "grad_norm": 2.364365816116333, + "kl": 1.3920183926820755, + "learning_rate": 6.50882436482303e-07, + "loss": 0.1392, + "num_tokens": 17658168.0, + "reward": 0.7457275390625, + "reward_std": 0.010490438900887966, + "rewards//mean": 0.7457275390625, + "rewards//std": 0.029386386275291443, + "step": 2043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4088, + "grad_norm": 3.800461769104004, + "kl": 1.5025834757834673, + "learning_rate": 6.505798668139563e-07, + "loss": 0.1503, + "num_tokens": 17666840.0, + "reward": 0.72894287109375, + "reward_std": 0.0077153644524514675, + "rewards//mean": 0.72894287109375, + "rewards//std": 0.04001575708389282, + "step": 2044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.409, + "grad_norm": 2.1188666820526123, + "kl": 1.90363715775311, + "learning_rate": 6.502772364920573e-07, + "loss": 0.1904, + "num_tokens": 17675568.0, + "reward": 0.76446533203125, + "reward_std": 0.014138279482722282, + "rewards//mean": 0.76446533203125, + "rewards//std": 0.025842614471912384, + "step": 2045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4092, + "grad_norm": 2.9869496822357178, + "kl": 1.4818255696445704, + "learning_rate": 6.499745456385053e-07, + "loss": 0.1482, + "num_tokens": 17684184.0, + "reward": 0.75079345703125, + "reward_std": 0.007513178512454033, + "rewards//mean": 0.75079345703125, + "rewards//std": 0.032735198736190796, + "step": 2046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4094, + "grad_norm": 3.033055543899536, + "kl": 0.9494079537689686, + "learning_rate": 6.496717943752243e-07, + "loss": 0.0949, + "num_tokens": 17692816.0, + "reward": 0.78106689453125, + "reward_std": 0.008452807553112507, + "rewards//mean": 0.78106689453125, + "rewards//std": 0.027869535610079765, + "step": 2047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4096, + "grad_norm": 2.536658525466919, + "kl": 0.9353618137538433, + "learning_rate": 6.493689828241624e-07, + "loss": 0.0935, + "num_tokens": 17701504.0, + "reward": 0.7774658203125, + "reward_std": 0.004981533158570528, + "rewards//mean": 0.7774658203125, + "rewards//std": 0.016307169571518898, + "step": 2048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4098, + "grad_norm": 2.1349055767059326, + "kl": 1.1380288396030664, + "learning_rate": 6.490661111072922e-07, + "loss": 0.1138, + "num_tokens": 17710096.0, + "reward": 0.75048828125, + "reward_std": 0.0110672852024436, + "rewards//mean": 0.75048828125, + "rewards//std": 0.03176885098218918, + "step": 2049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.41, + "grad_norm": 2.831186294555664, + "kl": 1.24269063398242, + "learning_rate": 6.487631793466103e-07, + "loss": 0.1243, + "num_tokens": 17718712.0, + "reward": 0.76141357421875, + "reward_std": 0.010870445519685745, + "rewards//mean": 0.76141357421875, + "rewards//std": 0.026248294860124588, + "step": 2050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4102, + "grad_norm": 4.94138240814209, + "kl": 0.9762107525020838, + "learning_rate": 6.484601876641375e-07, + "loss": 0.0976, + "num_tokens": 17727336.0, + "reward": 0.76458740234375, + "reward_std": 0.009395531378686428, + "rewards//mean": 0.76458740234375, + "rewards//std": 0.020066358149051666, + "step": 2051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4104, + "grad_norm": 8.553571701049805, + "kl": 0.691471990197897, + "learning_rate": 6.481571361819188e-07, + "loss": 0.0691, + "num_tokens": 17735816.0, + "reward": 0.7733154296875, + "reward_std": 0.006250659003853798, + "rewards//mean": 0.7733154296875, + "rewards//std": 0.025286376476287842, + "step": 2052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4106, + "grad_norm": 4.581208229064941, + "kl": 1.3401698898524046, + "learning_rate": 6.478540250220233e-07, + "loss": 0.134, + "num_tokens": 17744416.0, + "reward": 0.75860595703125, + "reward_std": 0.012517396360635757, + "rewards//mean": 0.75860595703125, + "rewards//std": 0.02889149822294712, + "step": 2053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4108, + "grad_norm": 2.1354095935821533, + "kl": 1.1008250955492258, + "learning_rate": 6.475508543065445e-07, + "loss": 0.1101, + "num_tokens": 17753072.0, + "reward": 0.7401123046875, + "reward_std": 0.006451470777392387, + "rewards//mean": 0.7401123046875, + "rewards//std": 0.019881151616573334, + "step": 2054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.411, + "grad_norm": 3.631265878677368, + "kl": 0.7547381389886141, + "learning_rate": 6.472476241575988e-07, + "loss": 0.0755, + "num_tokens": 17761640.0, + "reward": 0.7266845703125, + "reward_std": 0.005720584653317928, + "rewards//mean": 0.7266845703125, + "rewards//std": 0.038089118897914886, + "step": 2055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4112, + "grad_norm": 1.2131834030151367, + "kl": 0.9052568338811398, + "learning_rate": 6.46944334697328e-07, + "loss": 0.0905, + "num_tokens": 17770208.0, + "reward": 0.7615966796875, + "reward_std": 0.002265487564727664, + "rewards//mean": 0.7615966796875, + "rewards//std": 0.026502331718802452, + "step": 2056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4114, + "grad_norm": 1.8009181022644043, + "kl": 1.1845761258155107, + "learning_rate": 6.466409860478966e-07, + "loss": 0.1185, + "num_tokens": 17778824.0, + "reward": 0.73486328125, + "reward_std": 0.007122317794710398, + "rewards//mean": 0.73486328125, + "rewards//std": 0.03841540217399597, + "step": 2057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4116, + "grad_norm": 3.887512683868408, + "kl": 1.0132326427847147, + "learning_rate": 6.463375783314938e-07, + "loss": 0.1013, + "num_tokens": 17787488.0, + "reward": 0.75006103515625, + "reward_std": 0.0051925163716077805, + "rewards//mean": 0.75006103515625, + "rewards//std": 0.03622167930006981, + "step": 2058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4118, + "grad_norm": 2.5849311351776123, + "kl": 1.0403974521905184, + "learning_rate": 6.460341116703316e-07, + "loss": 0.104, + "num_tokens": 17796080.0, + "reward": 0.71124267578125, + "reward_std": 0.0045884703285992146, + "rewards//mean": 0.71124267578125, + "rewards//std": 0.025801578536629677, + "step": 2059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.412, + "grad_norm": 2.037450075149536, + "kl": 0.8995330817997456, + "learning_rate": 6.45730586186647e-07, + "loss": 0.09, + "num_tokens": 17804696.0, + "reward": 0.77142333984375, + "reward_std": 0.006435304414480925, + "rewards//mean": 0.77142333984375, + "rewards//std": 0.0335378460586071, + "step": 2060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4122, + "grad_norm": 9.42231559753418, + "kl": 2.2323528807610273, + "learning_rate": 6.454270020026995e-07, + "loss": 0.2232, + "num_tokens": 17813304.0, + "reward": 0.75457763671875, + "reward_std": 0.009538453072309494, + "rewards//mean": 0.75457763671875, + "rewards//std": 0.04063403233885765, + "step": 2061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4124, + "grad_norm": 2.6729183197021484, + "kl": 2.209277592599392, + "learning_rate": 6.451233592407731e-07, + "loss": 0.2209, + "num_tokens": 17821928.0, + "reward": 0.76422119140625, + "reward_std": 0.016325339674949646, + "rewards//mean": 0.76422119140625, + "rewards//std": 0.029727671295404434, + "step": 2062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4126, + "grad_norm": 4.329207897186279, + "kl": 1.6514078807085752, + "learning_rate": 6.448196580231748e-07, + "loss": 0.1651, + "num_tokens": 17830640.0, + "reward": 0.7615966796875, + "reward_std": 0.00783233717083931, + "rewards//mean": 0.7615966796875, + "rewards//std": 0.028509967029094696, + "step": 2063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4128, + "grad_norm": 1.5277107954025269, + "kl": 1.75281697884202, + "learning_rate": 6.445158984722358e-07, + "loss": 0.1753, + "num_tokens": 17839288.0, + "reward": 0.78521728515625, + "reward_std": 0.012233024463057518, + "rewards//mean": 0.78521728515625, + "rewards//std": 0.030133292078971863, + "step": 2064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.413, + "grad_norm": 1.8005568981170654, + "kl": 1.6472182553261518, + "learning_rate": 6.442120807103101e-07, + "loss": 0.1647, + "num_tokens": 17847952.0, + "reward": 0.77178955078125, + "reward_std": 0.014900758862495422, + "rewards//mean": 0.77178955078125, + "rewards//std": 0.034356992691755295, + "step": 2065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4132, + "grad_norm": 2.6313374042510986, + "kl": 1.8411420416086912, + "learning_rate": 6.439082048597755e-07, + "loss": 0.1841, + "num_tokens": 17856624.0, + "reward": 0.77581787109375, + "reward_std": 0.006978172343224287, + "rewards//mean": 0.77581787109375, + "rewards//std": 0.01901751570403576, + "step": 2066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4134, + "grad_norm": 6.2275848388671875, + "kl": 2.4125122260302305, + "learning_rate": 6.436042710430332e-07, + "loss": 0.2413, + "num_tokens": 17865360.0, + "reward": 0.771728515625, + "reward_std": 0.010828404687345028, + "rewards//mean": 0.771728515625, + "rewards//std": 0.031218014657497406, + "step": 2067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4136, + "grad_norm": 5.33842134475708, + "kl": 2.7863558419048786, + "learning_rate": 6.433002793825075e-07, + "loss": 0.2786, + "num_tokens": 17874080.0, + "reward": 0.739501953125, + "reward_std": 0.010757556185126305, + "rewards//mean": 0.739501953125, + "rewards//std": 0.039401277899742126, + "step": 2068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4138, + "grad_norm": 3.7492501735687256, + "kl": 1.13406933657825, + "learning_rate": 6.429962300006467e-07, + "loss": 0.1134, + "num_tokens": 17882720.0, + "reward": 0.75250244140625, + "reward_std": 0.010993307456374168, + "rewards//mean": 0.75250244140625, + "rewards//std": 0.0225459486246109, + "step": 2069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.414, + "grad_norm": 1.875473141670227, + "kl": 0.8898656964302063, + "learning_rate": 6.426921230199214e-07, + "loss": 0.089, + "num_tokens": 17891352.0, + "reward": 0.77154541015625, + "reward_std": 0.007440659683197737, + "rewards//mean": 0.77154541015625, + "rewards//std": 0.026401827111840248, + "step": 2070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4142, + "grad_norm": 1.8110045194625854, + "kl": 1.4920458272099495, + "learning_rate": 6.423879585628261e-07, + "loss": 0.1492, + "num_tokens": 17899984.0, + "reward": 0.7764892578125, + "reward_std": 0.009123200550675392, + "rewards//mean": 0.7764892578125, + "rewards//std": 0.026914941146969795, + "step": 2071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4144, + "grad_norm": 7.973541259765625, + "kl": 2.149272009730339, + "learning_rate": 6.420837367518779e-07, + "loss": 0.2149, + "num_tokens": 17908720.0, + "reward": 0.76470947265625, + "reward_std": 0.012989547103643417, + "rewards//mean": 0.76470947265625, + "rewards//std": 0.04894142225384712, + "step": 2072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4146, + "grad_norm": 1.169753074645996, + "kl": 0.6107358522713184, + "learning_rate": 6.417794577096178e-07, + "loss": 0.0611, + "num_tokens": 17917408.0, + "reward": 0.77362060546875, + "reward_std": 0.0035788617096841335, + "rewards//mean": 0.77362060546875, + "rewards//std": 0.020260799676179886, + "step": 2073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4148, + "grad_norm": 2.611766815185547, + "kl": 2.183673534542322, + "learning_rate": 6.414751215586089e-07, + "loss": 0.2184, + "num_tokens": 17926120.0, + "reward": 0.74560546875, + "reward_std": 0.009498022496700287, + "rewards//mean": 0.74560546875, + "rewards//std": 0.02878500521183014, + "step": 2074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.415, + "grad_norm": 8.240768432617188, + "kl": 2.1809114646166563, + "learning_rate": 6.411707284214383e-07, + "loss": 0.2181, + "num_tokens": 17934728.0, + "reward": 0.75390625, + "reward_std": 0.010094861499965191, + "rewards//mean": 0.75390625, + "rewards//std": 0.03924807161092758, + "step": 2075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4152, + "grad_norm": 0.47429579496383667, + "kl": 0.4447198919951916, + "learning_rate": 6.408662784207149e-07, + "loss": 0.0445, + "num_tokens": 17943344.0, + "reward": 0.7816162109375, + "reward_std": 0.0011425944976508617, + "rewards//mean": 0.7816162109375, + "rewards//std": 0.026736624538898468, + "step": 2076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4154, + "grad_norm": 2.3489840030670166, + "kl": 0.9940777625888586, + "learning_rate": 6.405617716790714e-07, + "loss": 0.0994, + "num_tokens": 17951968.0, + "reward": 0.780029296875, + "reward_std": 0.010559500195086002, + "rewards//mean": 0.780029296875, + "rewards//std": 0.020313872024416924, + "step": 2077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4156, + "grad_norm": 1.765203595161438, + "kl": 1.495544370263815, + "learning_rate": 6.402572083191631e-07, + "loss": 0.1496, + "num_tokens": 17960616.0, + "reward": 0.78363037109375, + "reward_std": 0.01145388837903738, + "rewards//mean": 0.78363037109375, + "rewards//std": 0.03647571802139282, + "step": 2078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4158, + "grad_norm": 4.124813556671143, + "kl": 1.8481785282492638, + "learning_rate": 6.39952588463668e-07, + "loss": 0.1848, + "num_tokens": 17969272.0, + "reward": 0.78070068359375, + "reward_std": 0.013623334467411041, + "rewards//mean": 0.78070068359375, + "rewards//std": 0.02973989024758339, + "step": 2079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.416, + "grad_norm": 1.7887225151062012, + "kl": 1.1756710764020681, + "learning_rate": 6.396479122352872e-07, + "loss": 0.1176, + "num_tokens": 17977864.0, + "reward": 0.73443603515625, + "reward_std": 0.005166611168533564, + "rewards//mean": 0.73443603515625, + "rewards//std": 0.03100483864545822, + "step": 2080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4162, + "grad_norm": 2.4726037979125977, + "kl": 2.1253573801368475, + "learning_rate": 6.393431797567439e-07, + "loss": 0.2125, + "num_tokens": 17986496.0, + "reward": 0.7296142578125, + "reward_std": 0.013269955292344093, + "rewards//mean": 0.7296142578125, + "rewards//std": 0.026166634634137154, + "step": 2081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4164, + "grad_norm": 9.156553268432617, + "kl": 1.1292007397860289, + "learning_rate": 6.390383911507844e-07, + "loss": 0.1129, + "num_tokens": 17995152.0, + "reward": 0.77935791015625, + "reward_std": 0.009170042350888252, + "rewards//mean": 0.77935791015625, + "rewards//std": 0.026593778282403946, + "step": 2082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4166, + "grad_norm": 4.945788383483887, + "kl": 2.0676492489874363, + "learning_rate": 6.387335465401776e-07, + "loss": 0.2068, + "num_tokens": 18003792.0, + "reward": 0.759765625, + "reward_std": 0.011839861050248146, + "rewards//mean": 0.759765625, + "rewards//std": 0.03657109662890434, + "step": 2083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4168, + "grad_norm": 2.1150784492492676, + "kl": 1.9461730364710093, + "learning_rate": 6.384286460477149e-07, + "loss": 0.1946, + "num_tokens": 18012512.0, + "reward": 0.76318359375, + "reward_std": 0.00895975437015295, + "rewards//mean": 0.76318359375, + "rewards//std": 0.027423353865742683, + "step": 2084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.417, + "grad_norm": 2.1498005390167236, + "kl": 1.535774925723672, + "learning_rate": 6.381236897962102e-07, + "loss": 0.1536, + "num_tokens": 18021112.0, + "reward": 0.717529296875, + "reward_std": 0.011439401656389236, + "rewards//mean": 0.717529296875, + "rewards//std": 0.03852479159832001, + "step": 2085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4172, + "grad_norm": 7.3897175788879395, + "kl": 2.0761629343032837, + "learning_rate": 6.378186779084995e-07, + "loss": 0.2076, + "num_tokens": 18029760.0, + "reward": 0.7542724609375, + "reward_std": 0.016885977238416672, + "rewards//mean": 0.7542724609375, + "rewards//std": 0.04063500836491585, + "step": 2086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4174, + "grad_norm": 4.471107482910156, + "kl": 1.6315892823040485, + "learning_rate": 6.375136105074422e-07, + "loss": 0.1632, + "num_tokens": 18038392.0, + "reward": 0.734619140625, + "reward_std": 0.012692532502114773, + "rewards//mean": 0.734619140625, + "rewards//std": 0.0459875650703907, + "step": 2087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4176, + "grad_norm": 2.5719826221466064, + "kl": 1.7751114573329687, + "learning_rate": 6.372084877159187e-07, + "loss": 0.1775, + "num_tokens": 18047040.0, + "reward": 0.72528076171875, + "reward_std": 0.01345803588628769, + "rewards//mean": 0.72528076171875, + "rewards//std": 0.039842501282691956, + "step": 2088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4178, + "grad_norm": 2.3328065872192383, + "kl": 0.8251436147838831, + "learning_rate": 6.369033096568329e-07, + "loss": 0.0825, + "num_tokens": 18055672.0, + "reward": 0.7811279296875, + "reward_std": 0.0090356869623065, + "rewards//mean": 0.7811279296875, + "rewards//std": 0.03016122244298458, + "step": 2089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.418, + "grad_norm": 3.3252112865448, + "kl": 2.387173403054476, + "learning_rate": 6.365980764531105e-07, + "loss": 0.2387, + "num_tokens": 18064424.0, + "reward": 0.7501220703125, + "reward_std": 0.0104212062433362, + "rewards//mean": 0.7501220703125, + "rewards//std": 0.0264840479940176, + "step": 2090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4182, + "grad_norm": 2.9021358489990234, + "kl": 1.2990241348743439, + "learning_rate": 6.362927882276989e-07, + "loss": 0.1299, + "num_tokens": 18073040.0, + "reward": 0.7509765625, + "reward_std": 0.011548900976777077, + "rewards//mean": 0.7509765625, + "rewards//std": 0.050974052399396896, + "step": 2091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4184, + "grad_norm": 2.0560474395751953, + "kl": 1.5595810990780592, + "learning_rate": 6.359874451035687e-07, + "loss": 0.156, + "num_tokens": 18081680.0, + "reward": 0.7305908203125, + "reward_std": 0.00845024548470974, + "rewards//mean": 0.7305908203125, + "rewards//std": 0.03647805005311966, + "step": 2092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4186, + "grad_norm": 5.574970245361328, + "kl": 1.2170843351632357, + "learning_rate": 6.356820472037118e-07, + "loss": 0.1217, + "num_tokens": 18090248.0, + "reward": 0.7337646484375, + "reward_std": 0.009080907329916954, + "rewards//mean": 0.7337646484375, + "rewards//std": 0.03919384628534317, + "step": 2093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4188, + "grad_norm": 15.680562973022461, + "kl": 1.8032910265028477, + "learning_rate": 6.353765946511427e-07, + "loss": 0.1803, + "num_tokens": 18098880.0, + "reward": 0.744873046875, + "reward_std": 0.010085179470479488, + "rewards//mean": 0.744873046875, + "rewards//std": 0.03532378003001213, + "step": 2094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.419, + "grad_norm": 5.354268550872803, + "kl": 1.120858358219266, + "learning_rate": 6.350710875688972e-07, + "loss": 0.1121, + "num_tokens": 18107504.0, + "reward": 0.76904296875, + "reward_std": 0.011052628047764301, + "rewards//mean": 0.76904296875, + "rewards//std": 0.024334879592061043, + "step": 2095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4192, + "grad_norm": 1.570924997329712, + "kl": 0.8186905924230814, + "learning_rate": 6.34765526080034e-07, + "loss": 0.0819, + "num_tokens": 18116192.0, + "reward": 0.75677490234375, + "reward_std": 0.005442342720925808, + "rewards//mean": 0.75677490234375, + "rewards//std": 0.03152964636683464, + "step": 2096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4194, + "grad_norm": 9.793559074401855, + "kl": 2.4173106756061316, + "learning_rate": 6.344599103076328e-07, + "loss": 0.2417, + "num_tokens": 18124792.0, + "reward": 0.741943359375, + "reward_std": 0.011938218027353287, + "rewards//mean": 0.741943359375, + "rewards//std": 0.04015025123953819, + "step": 2097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4196, + "grad_norm": 2.039402961730957, + "kl": 0.8308284021914005, + "learning_rate": 6.341542403747959e-07, + "loss": 0.0831, + "num_tokens": 18133496.0, + "reward": 0.7730712890625, + "reward_std": 0.005864838138222694, + "rewards//mean": 0.7730712890625, + "rewards//std": 0.030708342790603638, + "step": 2098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4198, + "grad_norm": 1.8466956615447998, + "kl": 2.1186951771378517, + "learning_rate": 6.338485164046471e-07, + "loss": 0.2119, + "num_tokens": 18142104.0, + "reward": 0.74798583984375, + "reward_std": 0.015523204579949379, + "rewards//mean": 0.74798583984375, + "rewards//std": 0.03592962771654129, + "step": 2099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.42, + "grad_norm": 6.199488639831543, + "kl": 3.4858515933156013, + "learning_rate": 6.335427385203319e-07, + "loss": 0.3486, + "num_tokens": 18150664.0, + "reward": 0.77685546875, + "reward_std": 0.025380093604326248, + "rewards//mean": 0.77685546875, + "rewards//std": 0.040639664977788925, + "step": 2100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4202, + "grad_norm": 0.9254546165466309, + "kl": 0.6357361879199743, + "learning_rate": 6.332369068450174e-07, + "loss": 0.0636, + "num_tokens": 18159216.0, + "reward": 0.7769775390625, + "reward_std": 0.0016999198123812675, + "rewards//mean": 0.7769775390625, + "rewards//std": 0.028010739013552666, + "step": 2101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4204, + "grad_norm": 7.393324375152588, + "kl": 1.3986777402460575, + "learning_rate": 6.329310215018931e-07, + "loss": 0.1399, + "num_tokens": 18167848.0, + "reward": 0.7493896484375, + "reward_std": 0.005857095587998629, + "rewards//mean": 0.7493896484375, + "rewards//std": 0.03305210545659065, + "step": 2102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4206, + "grad_norm": 7.460031986236572, + "kl": 2.483589192852378, + "learning_rate": 6.326250826141688e-07, + "loss": 0.2484, + "num_tokens": 18176528.0, + "reward": 0.77239990234375, + "reward_std": 0.012567653320729733, + "rewards//mean": 0.77239990234375, + "rewards//std": 0.03150659054517746, + "step": 2103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4208, + "grad_norm": 4.156373977661133, + "kl": 1.806695057079196, + "learning_rate": 6.323190903050774e-07, + "loss": 0.1807, + "num_tokens": 18185208.0, + "reward": 0.7320556640625, + "reward_std": 0.010320713743567467, + "rewards//mean": 0.7320556640625, + "rewards//std": 0.027750123292207718, + "step": 2104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.421, + "grad_norm": 5.214327812194824, + "kl": 1.683013927191496, + "learning_rate": 6.320130446978722e-07, + "loss": 0.1683, + "num_tokens": 18193968.0, + "reward": 0.7216796875, + "reward_std": 0.008354991674423218, + "rewards//mean": 0.7216796875, + "rewards//std": 0.022617090493440628, + "step": 2105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4212, + "grad_norm": 9.097914695739746, + "kl": 1.810871236026287, + "learning_rate": 6.317069459158282e-07, + "loss": 0.1811, + "num_tokens": 18202592.0, + "reward": 0.742919921875, + "reward_std": 0.01519353874027729, + "rewards//mean": 0.742919921875, + "rewards//std": 0.033988937735557556, + "step": 2106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4214, + "grad_norm": 5.557891845703125, + "kl": 1.9505478795617819, + "learning_rate": 6.314007940822425e-07, + "loss": 0.1951, + "num_tokens": 18211288.0, + "reward": 0.74090576171875, + "reward_std": 0.013918284326791763, + "rewards//mean": 0.74090576171875, + "rewards//std": 0.021732978522777557, + "step": 2107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4216, + "grad_norm": 7.896033763885498, + "kl": 2.5085130874067545, + "learning_rate": 6.310945893204324e-07, + "loss": 0.2509, + "num_tokens": 18219952.0, + "reward": 0.7344970703125, + "reward_std": 0.010494373738765717, + "rewards//mean": 0.7344970703125, + "rewards//std": 0.03119545802474022, + "step": 2108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4218, + "grad_norm": 4.950935363769531, + "kl": 1.2270944006741047, + "learning_rate": 6.307883317537374e-07, + "loss": 0.1227, + "num_tokens": 18228664.0, + "reward": 0.7900390625, + "reward_std": 0.009227382019162178, + "rewards//mean": 0.7900390625, + "rewards//std": 0.02851872518658638, + "step": 2109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.422, + "grad_norm": 6.613138675689697, + "kl": 1.7463128045201302, + "learning_rate": 6.30482021505518e-07, + "loss": 0.1746, + "num_tokens": 18237256.0, + "reward": 0.74322509765625, + "reward_std": 0.0102263567969203, + "rewards//mean": 0.74322509765625, + "rewards//std": 0.019945291802287102, + "step": 2110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4222, + "grad_norm": 5.39357328414917, + "kl": 1.523748204112053, + "learning_rate": 6.30175658699156e-07, + "loss": 0.1524, + "num_tokens": 18245912.0, + "reward": 0.763916015625, + "reward_std": 0.01314021646976471, + "rewards//mean": 0.763916015625, + "rewards//std": 0.028636319562792778, + "step": 2111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4224, + "grad_norm": 11.160625457763672, + "kl": 2.8511134013533592, + "learning_rate": 6.298692434580542e-07, + "loss": 0.2851, + "num_tokens": 18254528.0, + "reward": 0.75970458984375, + "reward_std": 0.008576745167374611, + "rewards//mean": 0.75970458984375, + "rewards//std": 0.031141243875026703, + "step": 2112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4226, + "grad_norm": 8.853973388671875, + "kl": 2.4117111582309008, + "learning_rate": 6.295627759056368e-07, + "loss": 0.2412, + "num_tokens": 18263104.0, + "reward": 0.75775146484375, + "reward_std": 0.013864162378013134, + "rewards//mean": 0.75775146484375, + "rewards//std": 0.03338221460580826, + "step": 2113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4228, + "grad_norm": 23.342029571533203, + "kl": 3.7847764994949102, + "learning_rate": 6.292562561653485e-07, + "loss": 0.3785, + "num_tokens": 18271784.0, + "reward": 0.75518798828125, + "reward_std": 0.016883403062820435, + "rewards//mean": 0.75518798828125, + "rewards//std": 0.038222964853048325, + "step": 2114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.423, + "grad_norm": 7.432125568389893, + "kl": 0.8695875890552998, + "learning_rate": 6.289496843606559e-07, + "loss": 0.087, + "num_tokens": 18280512.0, + "reward": 0.7508544921875, + "reward_std": 0.00664735259488225, + "rewards//mean": 0.7508544921875, + "rewards//std": 0.03417791798710823, + "step": 2115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4232, + "grad_norm": 11.224912643432617, + "kl": 2.320872815325856, + "learning_rate": 6.286430606150458e-07, + "loss": 0.2321, + "num_tokens": 18289144.0, + "reward": 0.7442626953125, + "reward_std": 0.014016811735928059, + "rewards//mean": 0.7442626953125, + "rewards//std": 0.03870101273059845, + "step": 2116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4234, + "grad_norm": 20.115665435791016, + "kl": 3.0253824815154076, + "learning_rate": 6.283363850520263e-07, + "loss": 0.3025, + "num_tokens": 18297800.0, + "reward": 0.7283935546875, + "reward_std": 0.02322789654135704, + "rewards//mean": 0.7283935546875, + "rewards//std": 0.04862479493021965, + "step": 2117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4236, + "grad_norm": 5.687526226043701, + "kl": 1.3542177956551313, + "learning_rate": 6.280296577951261e-07, + "loss": 0.1354, + "num_tokens": 18306448.0, + "reward": 0.77752685546875, + "reward_std": 0.006076337769627571, + "rewards//mean": 0.77752685546875, + "rewards//std": 0.029939260333776474, + "step": 2118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4238, + "grad_norm": 13.594491004943848, + "kl": 2.8181651029735804, + "learning_rate": 6.277228789678953e-07, + "loss": 0.2818, + "num_tokens": 18315168.0, + "reward": 0.7254638671875, + "reward_std": 0.011199472472071648, + "rewards//mean": 0.7254638671875, + "rewards//std": 0.02932657115161419, + "step": 2119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.424, + "grad_norm": 12.44481372833252, + "kl": 2.0713940542191267, + "learning_rate": 6.27416048693904e-07, + "loss": 0.2071, + "num_tokens": 18323840.0, + "reward": 0.7388916015625, + "reward_std": 0.015948716551065445, + "rewards//mean": 0.7388916015625, + "rewards//std": 0.041481517255306244, + "step": 2120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4242, + "grad_norm": 7.4084296226501465, + "kl": 1.8915503304451704, + "learning_rate": 6.271091670967436e-07, + "loss": 0.1892, + "num_tokens": 18332536.0, + "reward": 0.78314208984375, + "reward_std": 0.0176948644220829, + "rewards//mean": 0.78314208984375, + "rewards//std": 0.036484844982624054, + "step": 2121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4244, + "grad_norm": 2.852416515350342, + "kl": 1.5264004413038492, + "learning_rate": 6.268022343000257e-07, + "loss": 0.1526, + "num_tokens": 18341096.0, + "reward": 0.7418212890625, + "reward_std": 0.01598222926259041, + "rewards//mean": 0.7418212890625, + "rewards//std": 0.03863681107759476, + "step": 2122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4246, + "grad_norm": 3.3274168968200684, + "kl": 1.835201857611537, + "learning_rate": 6.26495250427383e-07, + "loss": 0.1835, + "num_tokens": 18349744.0, + "reward": 0.7379150390625, + "reward_std": 0.017215589061379433, + "rewards//mean": 0.7379150390625, + "rewards//std": 0.037056103348731995, + "step": 2123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4248, + "grad_norm": 2.3662467002868652, + "kl": 0.8203063625842333, + "learning_rate": 6.261882156024687e-07, + "loss": 0.082, + "num_tokens": 18358480.0, + "reward": 0.76251220703125, + "reward_std": 0.007508883252739906, + "rewards//mean": 0.76251220703125, + "rewards//std": 0.02467763051390648, + "step": 2124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.425, + "grad_norm": 3.8568899631500244, + "kl": 1.821099154651165, + "learning_rate": 6.258811299489563e-07, + "loss": 0.1821, + "num_tokens": 18367160.0, + "reward": 0.73419189453125, + "reward_std": 0.01234118640422821, + "rewards//mean": 0.73419189453125, + "rewards//std": 0.030325084924697876, + "step": 2125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4252, + "grad_norm": 3.9407923221588135, + "kl": 0.4495155494660139, + "learning_rate": 6.255739935905395e-07, + "loss": 0.045, + "num_tokens": 18375720.0, + "reward": 0.7685546875, + "reward_std": 0.002086243126541376, + "rewards//mean": 0.7685546875, + "rewards//std": 0.023561181500554085, + "step": 2126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4254, + "grad_norm": 9.088201522827148, + "kl": 1.1373247932642698, + "learning_rate": 6.252668066509334e-07, + "loss": 0.1137, + "num_tokens": 18384392.0, + "reward": 0.7838134765625, + "reward_std": 0.009399846196174622, + "rewards//mean": 0.7838134765625, + "rewards//std": 0.03139088675379753, + "step": 2127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4256, + "grad_norm": 4.97630500793457, + "kl": 1.3963549435138702, + "learning_rate": 6.249595692538725e-07, + "loss": 0.1396, + "num_tokens": 18393016.0, + "reward": 0.75299072265625, + "reward_std": 0.00955170951783657, + "rewards//mean": 0.75299072265625, + "rewards//std": 0.025683382526040077, + "step": 2128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4258, + "grad_norm": 4.449544429779053, + "kl": 1.6349581088870764, + "learning_rate": 6.24652281523112e-07, + "loss": 0.1635, + "num_tokens": 18401656.0, + "reward": 0.75506591796875, + "reward_std": 0.00892835482954979, + "rewards//mean": 0.75506591796875, + "rewards//std": 0.0331067331135273, + "step": 2129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.426, + "grad_norm": 3.853914260864258, + "kl": 1.140592908486724, + "learning_rate": 6.243449435824276e-07, + "loss": 0.1141, + "num_tokens": 18410192.0, + "reward": 0.75897216796875, + "reward_std": 0.006809788756072521, + "rewards//mean": 0.75897216796875, + "rewards//std": 0.022985469549894333, + "step": 2130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4262, + "grad_norm": 4.499453544616699, + "kl": 1.0645976811647415, + "learning_rate": 6.240375555556145e-07, + "loss": 0.1065, + "num_tokens": 18418792.0, + "reward": 0.76025390625, + "reward_std": 0.011194022372364998, + "rewards//mean": 0.76025390625, + "rewards//std": 0.030405409634113312, + "step": 2131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4264, + "grad_norm": 2.1362290382385254, + "kl": 0.797520499676466, + "learning_rate": 6.23730117566489e-07, + "loss": 0.0798, + "num_tokens": 18427328.0, + "reward": 0.775146484375, + "reward_std": 0.007097539957612753, + "rewards//mean": 0.775146484375, + "rewards//std": 0.027847325429320335, + "step": 2132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4266, + "grad_norm": 11.005412101745605, + "kl": 2.1718028001487255, + "learning_rate": 6.234226297388868e-07, + "loss": 0.2172, + "num_tokens": 18436016.0, + "reward": 0.7471923828125, + "reward_std": 0.01807505264878273, + "rewards//mean": 0.7471923828125, + "rewards//std": 0.0366072952747345, + "step": 2133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4268, + "grad_norm": 4.533393859863281, + "kl": 0.9100767783820629, + "learning_rate": 6.231150921966642e-07, + "loss": 0.091, + "num_tokens": 18444720.0, + "reward": 0.77728271484375, + "reward_std": 0.011736341752111912, + "rewards//mean": 0.77728271484375, + "rewards//std": 0.043667614459991455, + "step": 2134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.427, + "grad_norm": 2.304311513900757, + "kl": 0.6403944510966539, + "learning_rate": 6.228075050636972e-07, + "loss": 0.064, + "num_tokens": 18453224.0, + "reward": 0.72186279296875, + "reward_std": 0.0018586714286357164, + "rewards//mean": 0.72186279296875, + "rewards//std": 0.024960633367300034, + "step": 2135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4272, + "grad_norm": 8.2095947265625, + "kl": 1.7530698850750923, + "learning_rate": 6.22499868463882e-07, + "loss": 0.1753, + "num_tokens": 18461872.0, + "reward": 0.76837158203125, + "reward_std": 0.012861378490924835, + "rewards//mean": 0.76837158203125, + "rewards//std": 0.03546614199876785, + "step": 2136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4274, + "grad_norm": 6.979181289672852, + "kl": 2.0072787310928106, + "learning_rate": 6.221921825211341e-07, + "loss": 0.2007, + "num_tokens": 18470576.0, + "reward": 0.74468994140625, + "reward_std": 0.010150300338864326, + "rewards//mean": 0.74468994140625, + "rewards//std": 0.0372038409113884, + "step": 2137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4276, + "grad_norm": 1.7577670812606812, + "kl": 0.8509061858057976, + "learning_rate": 6.2188444735939e-07, + "loss": 0.0851, + "num_tokens": 18479224.0, + "reward": 0.7550048828125, + "reward_std": 0.005741292145103216, + "rewards//mean": 0.7550048828125, + "rewards//std": 0.024134688079357147, + "step": 2138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4278, + "grad_norm": 4.912137508392334, + "kl": 1.3587144520133734, + "learning_rate": 6.215766631026049e-07, + "loss": 0.1359, + "num_tokens": 18487864.0, + "reward": 0.76092529296875, + "reward_std": 0.014840027317404747, + "rewards//mean": 0.76092529296875, + "rewards//std": 0.034556444734334946, + "step": 2139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.428, + "grad_norm": 3.154823064804077, + "kl": 1.7637139838188887, + "learning_rate": 6.212688298747545e-07, + "loss": 0.1764, + "num_tokens": 18496592.0, + "reward": 0.7587890625, + "reward_std": 0.01390785351395607, + "rewards//mean": 0.7587890625, + "rewards//std": 0.03728599473834038, + "step": 2140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4282, + "grad_norm": 3.235072612762451, + "kl": 1.7916764188557863, + "learning_rate": 6.209609477998338e-07, + "loss": 0.1792, + "num_tokens": 18505272.0, + "reward": 0.7703857421875, + "reward_std": 0.011331296525895596, + "rewards//mean": 0.7703857421875, + "rewards//std": 0.02882678247988224, + "step": 2141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4284, + "grad_norm": 11.749098777770996, + "kl": 1.2385512199252844, + "learning_rate": 6.20653017001858e-07, + "loss": 0.1239, + "num_tokens": 18514000.0, + "reward": 0.7647705078125, + "reward_std": 0.01076004933565855, + "rewards//mean": 0.7647705078125, + "rewards//std": 0.04034039005637169, + "step": 2142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4286, + "grad_norm": 2.9059765338897705, + "kl": 1.418618032708764, + "learning_rate": 6.203450376048614e-07, + "loss": 0.1419, + "num_tokens": 18522616.0, + "reward": 0.7689208984375, + "reward_std": 0.013670215383172035, + "rewards//mean": 0.7689208984375, + "rewards//std": 0.03702668100595474, + "step": 2143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4288, + "grad_norm": 5.828917026519775, + "kl": 1.170566013082862, + "learning_rate": 6.200370097328978e-07, + "loss": 0.1171, + "num_tokens": 18531128.0, + "reward": 0.75726318359375, + "reward_std": 0.012630200013518333, + "rewards//mean": 0.75726318359375, + "rewards//std": 0.036972008645534515, + "step": 2144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.429, + "grad_norm": 5.842690944671631, + "kl": 1.8484732639044523, + "learning_rate": 6.197289335100412e-07, + "loss": 0.1848, + "num_tokens": 18539808.0, + "reward": 0.71337890625, + "reward_std": 0.013507567346096039, + "rewards//mean": 0.71337890625, + "rewards//std": 0.037133026868104935, + "step": 2145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4292, + "grad_norm": 11.032599449157715, + "kl": 1.3193820994347334, + "learning_rate": 6.194208090603844e-07, + "loss": 0.1319, + "num_tokens": 18548384.0, + "reward": 0.760498046875, + "reward_std": 0.009324944578111172, + "rewards//mean": 0.760498046875, + "rewards//std": 0.023774802684783936, + "step": 2146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4294, + "grad_norm": 4.721080780029297, + "kl": 2.2098822835832834, + "learning_rate": 6.191126365080401e-07, + "loss": 0.221, + "num_tokens": 18557016.0, + "reward": 0.77001953125, + "reward_std": 0.018213987350463867, + "rewards//mean": 0.77001953125, + "rewards//std": 0.03800337761640549, + "step": 2147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4296, + "grad_norm": 1.702780842781067, + "kl": 1.0110408384352922, + "learning_rate": 6.1880441597714e-07, + "loss": 0.1011, + "num_tokens": 18565536.0, + "reward": 0.777587890625, + "reward_std": 0.008701791055500507, + "rewards//mean": 0.777587890625, + "rewards//std": 0.030168499797582626, + "step": 2148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4298, + "grad_norm": 9.587425231933594, + "kl": 2.704019645228982, + "learning_rate": 6.184961475918355e-07, + "loss": 0.2704, + "num_tokens": 18574376.0, + "reward": 0.6922607421875, + "reward_std": 0.007902892306447029, + "rewards//mean": 0.6922607421875, + "rewards//std": 0.0394417904317379, + "step": 2149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.43, + "grad_norm": 3.4232468605041504, + "kl": 1.1936355344951153, + "learning_rate": 6.181878314762968e-07, + "loss": 0.1194, + "num_tokens": 18582968.0, + "reward": 0.74786376953125, + "reward_std": 0.010698895901441574, + "rewards//mean": 0.74786376953125, + "rewards//std": 0.03778881952166557, + "step": 2150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4302, + "grad_norm": 4.0267157554626465, + "kl": 1.0333496294915676, + "learning_rate": 6.178794677547137e-07, + "loss": 0.1033, + "num_tokens": 18591656.0, + "reward": 0.770751953125, + "reward_std": 0.009181185625493526, + "rewards//mean": 0.770751953125, + "rewards//std": 0.018634876236319542, + "step": 2151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4304, + "grad_norm": 7.141615867614746, + "kl": 1.5003833770751953, + "learning_rate": 6.17571056551295e-07, + "loss": 0.15, + "num_tokens": 18600320.0, + "reward": 0.74786376953125, + "reward_std": 0.010793449357151985, + "rewards//mean": 0.74786376953125, + "rewards//std": 0.030072445049881935, + "step": 2152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4306, + "grad_norm": 2.650692939758301, + "kl": 1.588726183399558, + "learning_rate": 6.172625979902689e-07, + "loss": 0.1589, + "num_tokens": 18608952.0, + "reward": 0.76263427734375, + "reward_std": 0.01217411644756794, + "rewards//mean": 0.76263427734375, + "rewards//std": 0.0297876987606287, + "step": 2153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4308, + "grad_norm": 3.7866313457489014, + "kl": 1.5632145144045353, + "learning_rate": 6.169540921958822e-07, + "loss": 0.1563, + "num_tokens": 18617552.0, + "reward": 0.7578125, + "reward_std": 0.013520997017621994, + "rewards//mean": 0.7578125, + "rewards//std": 0.03616484999656677, + "step": 2154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.431, + "grad_norm": 4.070205211639404, + "kl": 1.0723882094025612, + "learning_rate": 6.166455392924014e-07, + "loss": 0.1072, + "num_tokens": 18626176.0, + "reward": 0.76544189453125, + "reward_std": 0.011572964489459991, + "rewards//mean": 0.76544189453125, + "rewards//std": 0.031237827613949776, + "step": 2155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4312, + "grad_norm": 2.9327125549316406, + "kl": 1.4537433069199324, + "learning_rate": 6.163369394041111e-07, + "loss": 0.1454, + "num_tokens": 18634896.0, + "reward": 0.7596435546875, + "reward_std": 0.01330474391579628, + "rewards//mean": 0.7596435546875, + "rewards//std": 0.03291809931397438, + "step": 2156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4314, + "grad_norm": 13.11862850189209, + "kl": 1.117951923981309, + "learning_rate": 6.160282926553158e-07, + "loss": 0.1118, + "num_tokens": 18643544.0, + "reward": 0.73834228515625, + "reward_std": 0.0061152055859565735, + "rewards//mean": 0.73834228515625, + "rewards//std": 0.03605412319302559, + "step": 2157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4316, + "grad_norm": 9.808173179626465, + "kl": 1.606158809736371, + "learning_rate": 6.157195991703377e-07, + "loss": 0.1606, + "num_tokens": 18652136.0, + "reward": 0.77374267578125, + "reward_std": 0.006930619012564421, + "rewards//mean": 0.77374267578125, + "rewards//std": 0.030196022242307663, + "step": 2158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4318, + "grad_norm": 5.353233814239502, + "kl": 2.0104468278586864, + "learning_rate": 6.154108590735191e-07, + "loss": 0.201, + "num_tokens": 18660760.0, + "reward": 0.7772216796875, + "reward_std": 0.01761246658861637, + "rewards//mean": 0.7772216796875, + "rewards//std": 0.033414676785469055, + "step": 2159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.432, + "grad_norm": 10.758246421813965, + "kl": 1.1015125382691622, + "learning_rate": 6.151020724892204e-07, + "loss": 0.1102, + "num_tokens": 18669424.0, + "reward": 0.75531005859375, + "reward_std": 0.006562560796737671, + "rewards//mean": 0.75531005859375, + "rewards//std": 0.03116748295724392, + "step": 2160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4322, + "grad_norm": 10.276647567749023, + "kl": 0.792457576841116, + "learning_rate": 6.147932395418205e-07, + "loss": 0.0792, + "num_tokens": 18678072.0, + "reward": 0.74188232421875, + "reward_std": 0.005382788833230734, + "rewards//mean": 0.74188232421875, + "rewards//std": 0.017320360988378525, + "step": 2161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4324, + "grad_norm": 4.536101818084717, + "kl": 1.54203487560153, + "learning_rate": 6.144843603557175e-07, + "loss": 0.1542, + "num_tokens": 18686696.0, + "reward": 0.76177978515625, + "reward_std": 0.0062855747528374195, + "rewards//mean": 0.76177978515625, + "rewards//std": 0.035218145698308945, + "step": 2162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4326, + "grad_norm": 3.4061903953552246, + "kl": 1.8046353124082088, + "learning_rate": 6.141754350553279e-07, + "loss": 0.1805, + "num_tokens": 18695304.0, + "reward": 0.72412109375, + "reward_std": 0.011086277663707733, + "rewards//mean": 0.72412109375, + "rewards//std": 0.023882798850536346, + "step": 2163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4328, + "grad_norm": 3.208974838256836, + "kl": 1.723298080265522, + "learning_rate": 6.138664637650866e-07, + "loss": 0.1723, + "num_tokens": 18703928.0, + "reward": 0.7537841796875, + "reward_std": 0.014096071943640709, + "rewards//mean": 0.7537841796875, + "rewards//std": 0.03369618207216263, + "step": 2164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.433, + "grad_norm": 8.98055362701416, + "kl": 1.96144974604249, + "learning_rate": 6.135574466094475e-07, + "loss": 0.1961, + "num_tokens": 18712608.0, + "reward": 0.75579833984375, + "reward_std": 0.010316908359527588, + "rewards//mean": 0.75579833984375, + "rewards//std": 0.03192469850182533, + "step": 2165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4332, + "grad_norm": 1.1932562589645386, + "kl": 1.1197879947721958, + "learning_rate": 6.132483837128823e-07, + "loss": 0.112, + "num_tokens": 18721288.0, + "reward": 0.75421142578125, + "reward_std": 0.005633528344333172, + "rewards//mean": 0.75421142578125, + "rewards//std": 0.028067629784345627, + "step": 2166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4334, + "grad_norm": 3.0272440910339355, + "kl": 1.6767716985195875, + "learning_rate": 6.129392751998816e-07, + "loss": 0.1677, + "num_tokens": 18729968.0, + "reward": 0.7490234375, + "reward_std": 0.011375471949577332, + "rewards//mean": 0.7490234375, + "rewards//std": 0.032682035118341446, + "step": 2167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4336, + "grad_norm": 0.9035928249359131, + "kl": 0.753477955237031, + "learning_rate": 6.126301211949545e-07, + "loss": 0.0753, + "num_tokens": 18738616.0, + "reward": 0.77423095703125, + "reward_std": 0.003938069101423025, + "rewards//mean": 0.77423095703125, + "rewards//std": 0.022338872775435448, + "step": 2168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4338, + "grad_norm": 3.4194743633270264, + "kl": 0.9865223374217749, + "learning_rate": 6.12320921822628e-07, + "loss": 0.0987, + "num_tokens": 18747304.0, + "reward": 0.7744140625, + "reward_std": 0.004792301915585995, + "rewards//mean": 0.7744140625, + "rewards//std": 0.017812522128224373, + "step": 2169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.434, + "grad_norm": 3.9922354221343994, + "kl": 1.0991936400532722, + "learning_rate": 6.120116772074477e-07, + "loss": 0.1099, + "num_tokens": 18755968.0, + "reward": 0.7884521484375, + "reward_std": 0.014512930065393448, + "rewards//mean": 0.7884521484375, + "rewards//std": 0.029718313366174698, + "step": 2170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4342, + "grad_norm": 1.4978853464126587, + "kl": 0.5665574371814728, + "learning_rate": 6.117023874739771e-07, + "loss": 0.0567, + "num_tokens": 18764528.0, + "reward": 0.7288818359375, + "reward_std": 0.0035977151710540056, + "rewards//mean": 0.7288818359375, + "rewards//std": 0.03581300005316734, + "step": 2171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4344, + "grad_norm": 2.9064717292785645, + "kl": 1.5892184115946293, + "learning_rate": 6.113930527467983e-07, + "loss": 0.1589, + "num_tokens": 18773128.0, + "reward": 0.71044921875, + "reward_std": 0.014908688142895699, + "rewards//mean": 0.71044921875, + "rewards//std": 0.03839017450809479, + "step": 2172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4346, + "grad_norm": 3.196190357208252, + "kl": 1.3780927266925573, + "learning_rate": 6.110836731505111e-07, + "loss": 0.1378, + "num_tokens": 18781720.0, + "reward": 0.784423828125, + "reward_std": 0.01749286614358425, + "rewards//mean": 0.784423828125, + "rewards//std": 0.034861356019973755, + "step": 2173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4348, + "grad_norm": 5.666129112243652, + "kl": 2.0088922437280416, + "learning_rate": 6.107742488097338e-07, + "loss": 0.2009, + "num_tokens": 18790432.0, + "reward": 0.75592041015625, + "reward_std": 0.01673523336648941, + "rewards//mean": 0.75592041015625, + "rewards//std": 0.043464694172143936, + "step": 2174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.435, + "grad_norm": 0.8005154132843018, + "kl": 0.7844834979623556, + "learning_rate": 6.104647798491021e-07, + "loss": 0.0784, + "num_tokens": 18799040.0, + "reward": 0.718017578125, + "reward_std": 0.0040486156940460205, + "rewards//mean": 0.718017578125, + "rewards//std": 0.02830454148352146, + "step": 2175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4352, + "grad_norm": 2.143218755722046, + "kl": 1.1363839954137802, + "learning_rate": 6.101552663932703e-07, + "loss": 0.1136, + "num_tokens": 18807592.0, + "reward": 0.75531005859375, + "reward_std": 0.009802170097827911, + "rewards//mean": 0.75531005859375, + "rewards//std": 0.02792108803987503, + "step": 2176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4354, + "grad_norm": 8.324600219726562, + "kl": 2.1878544725477695, + "learning_rate": 6.098457085669104e-07, + "loss": 0.2188, + "num_tokens": 18816256.0, + "reward": 0.7451171875, + "reward_std": 0.015088779851794243, + "rewards//mean": 0.7451171875, + "rewards//std": 0.04121077060699463, + "step": 2177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4356, + "grad_norm": 20.794355392456055, + "kl": 3.1621410995721817, + "learning_rate": 6.095361064947123e-07, + "loss": 0.3162, + "num_tokens": 18825152.0, + "reward": 0.77301025390625, + "reward_std": 0.005038381554186344, + "rewards//mean": 0.77301025390625, + "rewards//std": 0.035441804677248, + "step": 2178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4358, + "grad_norm": 33.44708251953125, + "kl": 0.9554375950247049, + "learning_rate": 6.092264603013836e-07, + "loss": 0.0955, + "num_tokens": 18833744.0, + "reward": 0.78448486328125, + "reward_std": 0.006282017100602388, + "rewards//mean": 0.78448486328125, + "rewards//std": 0.032313521951436996, + "step": 2179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.436, + "grad_norm": 3.3983073234558105, + "kl": 1.4822800997644663, + "learning_rate": 6.089167701116498e-07, + "loss": 0.1482, + "num_tokens": 18842312.0, + "reward": 0.77099609375, + "reward_std": 0.016543421894311905, + "rewards//mean": 0.77099609375, + "rewards//std": 0.035190682858228683, + "step": 2180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4362, + "grad_norm": 2.9370338916778564, + "kl": 1.225297475233674, + "learning_rate": 6.086070360502539e-07, + "loss": 0.1225, + "num_tokens": 18850984.0, + "reward": 0.730712890625, + "reward_std": 0.008631674572825432, + "rewards//mean": 0.730712890625, + "rewards//std": 0.029813161119818687, + "step": 2181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4364, + "grad_norm": 5.979632377624512, + "kl": 1.627122599631548, + "learning_rate": 6.082972582419568e-07, + "loss": 0.1627, + "num_tokens": 18859696.0, + "reward": 0.74945068359375, + "reward_std": 0.016555923968553543, + "rewards//mean": 0.74945068359375, + "rewards//std": 0.034840453416109085, + "step": 2182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4366, + "grad_norm": 1.5862843990325928, + "kl": 1.249754261225462, + "learning_rate": 6.079874368115373e-07, + "loss": 0.125, + "num_tokens": 18868368.0, + "reward": 0.74041748046875, + "reward_std": 0.009764362126588821, + "rewards//mean": 0.74041748046875, + "rewards//std": 0.03229384124279022, + "step": 2183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4368, + "grad_norm": 6.354976654052734, + "kl": 1.9152375888079405, + "learning_rate": 6.07677571883791e-07, + "loss": 0.1915, + "num_tokens": 18877112.0, + "reward": 0.7677001953125, + "reward_std": 0.009394929744303226, + "rewards//mean": 0.7677001953125, + "rewards//std": 0.030591784045100212, + "step": 2184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.437, + "grad_norm": 6.0940260887146, + "kl": 0.7830645311623812, + "learning_rate": 6.073676635835316e-07, + "loss": 0.0783, + "num_tokens": 18885768.0, + "reward": 0.7586669921875, + "reward_std": 0.004875886719673872, + "rewards//mean": 0.7586669921875, + "rewards//std": 0.023493947461247444, + "step": 2185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4372, + "grad_norm": 3.423069953918457, + "kl": 1.0215796921402216, + "learning_rate": 6.070577120355902e-07, + "loss": 0.1022, + "num_tokens": 18894440.0, + "reward": 0.7093505859375, + "reward_std": 0.004153335001319647, + "rewards//mean": 0.7093505859375, + "rewards//std": 0.03390220180153847, + "step": 2186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4374, + "grad_norm": 4.663547992706299, + "kl": 2.1135014928877354, + "learning_rate": 6.067477173648152e-07, + "loss": 0.2114, + "num_tokens": 18903144.0, + "reward": 0.74072265625, + "reward_std": 0.014434357173740864, + "rewards//mean": 0.74072265625, + "rewards//std": 0.047442518174648285, + "step": 2187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4376, + "grad_norm": 4.166323661804199, + "kl": 0.9639910086989403, + "learning_rate": 6.064376796960723e-07, + "loss": 0.0964, + "num_tokens": 18911760.0, + "reward": 0.75897216796875, + "reward_std": 0.0025895023718476295, + "rewards//mean": 0.75897216796875, + "rewards//std": 0.024076521396636963, + "step": 2188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4378, + "grad_norm": 2.9191033840179443, + "kl": 0.8231822419911623, + "learning_rate": 6.06127599154245e-07, + "loss": 0.0823, + "num_tokens": 18920392.0, + "reward": 0.75982666015625, + "reward_std": 0.004977487958967686, + "rewards//mean": 0.75982666015625, + "rewards//std": 0.024289971217513084, + "step": 2189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.438, + "grad_norm": 4.337491989135742, + "kl": 2.085209859535098, + "learning_rate": 6.058174758642332e-07, + "loss": 0.2085, + "num_tokens": 18929032.0, + "reward": 0.74700927734375, + "reward_std": 0.00833970122039318, + "rewards//mean": 0.74700927734375, + "rewards//std": 0.026754947379231453, + "step": 2190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4382, + "grad_norm": 5.553107738494873, + "kl": 1.5874354597181082, + "learning_rate": 6.055073099509549e-07, + "loss": 0.1587, + "num_tokens": 18937696.0, + "reward": 0.77398681640625, + "reward_std": 0.012309102341532707, + "rewards//mean": 0.77398681640625, + "rewards//std": 0.02603112906217575, + "step": 2191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4384, + "grad_norm": 3.687373638153076, + "kl": 1.5178766623139381, + "learning_rate": 6.051971015393446e-07, + "loss": 0.1518, + "num_tokens": 18946344.0, + "reward": 0.7655029296875, + "reward_std": 0.012161138467490673, + "rewards//mean": 0.7655029296875, + "rewards//std": 0.028125077486038208, + "step": 2192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4386, + "grad_norm": 19.311620712280273, + "kl": 1.3726043608039618, + "learning_rate": 6.048868507543546e-07, + "loss": 0.1373, + "num_tokens": 18955040.0, + "reward": 0.731201171875, + "reward_std": 0.00678750965744257, + "rewards//mean": 0.731201171875, + "rewards//std": 0.028905706480145454, + "step": 2193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4388, + "grad_norm": 2.079606294631958, + "kl": 0.832677049562335, + "learning_rate": 6.045765577209536e-07, + "loss": 0.0833, + "num_tokens": 18963656.0, + "reward": 0.74688720703125, + "reward_std": 0.005319797433912754, + "rewards//mean": 0.74688720703125, + "rewards//std": 0.027596617117524147, + "step": 2194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.439, + "grad_norm": 10.224716186523438, + "kl": 2.7264945339411497, + "learning_rate": 6.042662225641276e-07, + "loss": 0.2726, + "num_tokens": 18972344.0, + "reward": 0.76007080078125, + "reward_std": 0.009468501433730125, + "rewards//mean": 0.76007080078125, + "rewards//std": 0.036754779517650604, + "step": 2195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4392, + "grad_norm": 3.092094898223877, + "kl": 1.1290774885565042, + "learning_rate": 6.039558454088795e-07, + "loss": 0.1129, + "num_tokens": 18980904.0, + "reward": 0.789306640625, + "reward_std": 0.004676284734159708, + "rewards//mean": 0.789306640625, + "rewards//std": 0.020913109183311462, + "step": 2196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4394, + "grad_norm": 8.797189712524414, + "kl": 2.7929136995226145, + "learning_rate": 6.036454263802297e-07, + "loss": 0.2793, + "num_tokens": 18989544.0, + "reward": 0.751220703125, + "reward_std": 0.009476939216256142, + "rewards//mean": 0.751220703125, + "rewards//std": 0.039345916360616684, + "step": 2197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4396, + "grad_norm": 24.530960083007812, + "kl": 3.1615314800292253, + "learning_rate": 6.033349656032143e-07, + "loss": 0.3162, + "num_tokens": 18998184.0, + "reward": 0.7589111328125, + "reward_std": 0.014757446944713593, + "rewards//mean": 0.7589111328125, + "rewards//std": 0.03508016839623451, + "step": 2198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4398, + "grad_norm": 11.733026504516602, + "kl": 1.7158165480941534, + "learning_rate": 6.03024463202887e-07, + "loss": 0.1716, + "num_tokens": 19006832.0, + "reward": 0.7730712890625, + "reward_std": 0.005006241146475077, + "rewards//mean": 0.7730712890625, + "rewards//std": 0.026426827535033226, + "step": 2199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.44, + "grad_norm": 14.778258323669434, + "kl": 2.6377467457205057, + "learning_rate": 6.027139193043183e-07, + "loss": 0.2638, + "num_tokens": 19015520.0, + "reward": 0.769287109375, + "reward_std": 0.015146201476454735, + "rewards//mean": 0.769287109375, + "rewards//std": 0.03670908510684967, + "step": 2200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4402, + "grad_norm": 6.576761245727539, + "kl": 1.7896684668958187, + "learning_rate": 6.024033340325954e-07, + "loss": 0.179, + "num_tokens": 19024152.0, + "reward": 0.76422119140625, + "reward_std": 0.010269703343510628, + "rewards//mean": 0.76422119140625, + "rewards//std": 0.03793235868215561, + "step": 2201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4404, + "grad_norm": 6.309030532836914, + "kl": 1.3730927128344774, + "learning_rate": 6.020927075128216e-07, + "loss": 0.1373, + "num_tokens": 19032808.0, + "reward": 0.7271728515625, + "reward_std": 0.00622583320364356, + "rewards//mean": 0.7271728515625, + "rewards//std": 0.026822544634342194, + "step": 2202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4406, + "grad_norm": 9.3938570022583, + "kl": 2.0689783580601215, + "learning_rate": 6.017820398701174e-07, + "loss": 0.2069, + "num_tokens": 19041512.0, + "reward": 0.75384521484375, + "reward_std": 0.014088593423366547, + "rewards//mean": 0.75384521484375, + "rewards//std": 0.038729310035705566, + "step": 2203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4408, + "grad_norm": 19.620643615722656, + "kl": 1.9716646689921618, + "learning_rate": 6.014713312296198e-07, + "loss": 0.1972, + "num_tokens": 19050184.0, + "reward": 0.75457763671875, + "reward_std": 0.010911685414612293, + "rewards//mean": 0.75457763671875, + "rewards//std": 0.033922456204891205, + "step": 2204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.441, + "grad_norm": 0.3292747139930725, + "kl": 0.43618749640882015, + "learning_rate": 6.011605817164821e-07, + "loss": 0.0436, + "num_tokens": 19058720.0, + "reward": 0.7493896484375, + "reward_std": 0.0005712973070330918, + "rewards//mean": 0.7493896484375, + "rewards//std": 0.026422245427966118, + "step": 2205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4412, + "grad_norm": 7.184051990509033, + "kl": 2.175535971298814, + "learning_rate": 6.008497914558743e-07, + "loss": 0.2176, + "num_tokens": 19067424.0, + "reward": 0.757080078125, + "reward_std": 0.009729809127748013, + "rewards//mean": 0.757080078125, + "rewards//std": 0.03514505550265312, + "step": 2206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4414, + "grad_norm": 14.854913711547852, + "kl": 2.476182458922267, + "learning_rate": 6.005389605729824e-07, + "loss": 0.2476, + "num_tokens": 19076272.0, + "reward": 0.75335693359375, + "reward_std": 0.010970649309456348, + "rewards//mean": 0.75335693359375, + "rewards//std": 0.024238815531134605, + "step": 2207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4416, + "grad_norm": 4.014355182647705, + "kl": 1.5123499017208815, + "learning_rate": 6.002280891930093e-07, + "loss": 0.1512, + "num_tokens": 19084864.0, + "reward": 0.72613525390625, + "reward_std": 0.010421417653560638, + "rewards//mean": 0.72613525390625, + "rewards//std": 0.027560940012335777, + "step": 2208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4418, + "grad_norm": 6.3394389152526855, + "kl": 0.9982382394373417, + "learning_rate": 5.999171774411736e-07, + "loss": 0.0998, + "num_tokens": 19093552.0, + "reward": 0.754150390625, + "reward_std": 0.01038735918700695, + "rewards//mean": 0.754150390625, + "rewards//std": 0.02774275839328766, + "step": 2209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.442, + "grad_norm": 8.232433319091797, + "kl": 1.4321093633770943, + "learning_rate": 5.996062254427111e-07, + "loss": 0.1432, + "num_tokens": 19102240.0, + "reward": 0.76446533203125, + "reward_std": 0.015326702035963535, + "rewards//mean": 0.76446533203125, + "rewards//std": 0.03654247149825096, + "step": 2210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4422, + "grad_norm": 9.17300033569336, + "kl": 2.0993913877755404, + "learning_rate": 5.992952333228726e-07, + "loss": 0.2099, + "num_tokens": 19110888.0, + "reward": 0.73992919921875, + "reward_std": 0.020127736032009125, + "rewards//mean": 0.73992919921875, + "rewards//std": 0.03533700853586197, + "step": 2211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4424, + "grad_norm": 6.191318035125732, + "kl": 1.4674593787640333, + "learning_rate": 5.989842012069264e-07, + "loss": 0.1467, + "num_tokens": 19119568.0, + "reward": 0.72418212890625, + "reward_std": 0.005664869211614132, + "rewards//mean": 0.72418212890625, + "rewards//std": 0.03747142478823662, + "step": 2212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4426, + "grad_norm": 2.296954393386841, + "kl": 1.4854791834950447, + "learning_rate": 5.986731292201554e-07, + "loss": 0.1485, + "num_tokens": 19128184.0, + "reward": 0.7471923828125, + "reward_std": 0.014472205191850662, + "rewards//mean": 0.7471923828125, + "rewards//std": 0.035560186952352524, + "step": 2213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4428, + "grad_norm": 4.560058116912842, + "kl": 0.8986778799444437, + "learning_rate": 5.983620174878601e-07, + "loss": 0.0899, + "num_tokens": 19136728.0, + "reward": 0.77490234375, + "reward_std": 0.003667796030640602, + "rewards//mean": 0.77490234375, + "rewards//std": 0.022266332060098648, + "step": 2214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.443, + "grad_norm": 7.900885581970215, + "kl": 1.5524863712489605, + "learning_rate": 5.980508661353556e-07, + "loss": 0.1552, + "num_tokens": 19145344.0, + "reward": 0.74664306640625, + "reward_std": 0.012428708374500275, + "rewards//mean": 0.74664306640625, + "rewards//std": 0.03506911173462868, + "step": 2215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4432, + "grad_norm": 4.232226848602295, + "kl": 1.097946371883154, + "learning_rate": 5.977396752879741e-07, + "loss": 0.1098, + "num_tokens": 19153904.0, + "reward": 0.74468994140625, + "reward_std": 0.0064017619006335735, + "rewards//mean": 0.74468994140625, + "rewards//std": 0.027912413701415062, + "step": 2216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4434, + "grad_norm": 4.749003887176514, + "kl": 1.5166679192334414, + "learning_rate": 5.97428445071063e-07, + "loss": 0.1517, + "num_tokens": 19162560.0, + "reward": 0.75103759765625, + "reward_std": 0.010828148573637009, + "rewards//mean": 0.75103759765625, + "rewards//std": 0.03773951530456543, + "step": 2217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4436, + "grad_norm": 3.725642681121826, + "kl": 0.7739944774657488, + "learning_rate": 5.97117175609986e-07, + "loss": 0.0774, + "num_tokens": 19171104.0, + "reward": 0.7379150390625, + "reward_std": 0.003140170592814684, + "rewards//mean": 0.7379150390625, + "rewards//std": 0.023681342601776123, + "step": 2218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4438, + "grad_norm": 9.205086708068848, + "kl": 1.5616345014423132, + "learning_rate": 5.968058670301221e-07, + "loss": 0.1562, + "num_tokens": 19179704.0, + "reward": 0.74267578125, + "reward_std": 0.009503135457634926, + "rewards//mean": 0.74267578125, + "rewards//std": 0.03642842918634415, + "step": 2219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.444, + "grad_norm": 3.7951974868774414, + "kl": 0.844998637214303, + "learning_rate": 5.964945194568668e-07, + "loss": 0.0845, + "num_tokens": 19188312.0, + "reward": 0.76922607421875, + "reward_std": 0.009402241557836533, + "rewards//mean": 0.76922607421875, + "rewards//std": 0.02161284349858761, + "step": 2220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4442, + "grad_norm": 5.408355236053467, + "kl": 1.5079319383949041, + "learning_rate": 5.961831330156305e-07, + "loss": 0.1508, + "num_tokens": 19196944.0, + "reward": 0.760986328125, + "reward_std": 0.01352232787758112, + "rewards//mean": 0.760986328125, + "rewards//std": 0.026538006961345673, + "step": 2221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4444, + "grad_norm": 2.451479911804199, + "kl": 0.8313476666808128, + "learning_rate": 5.958717078318396e-07, + "loss": 0.0831, + "num_tokens": 19205624.0, + "reward": 0.7874755859375, + "reward_std": 0.004800444468855858, + "rewards//mean": 0.7874755859375, + "rewards//std": 0.01885252632200718, + "step": 2222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4446, + "grad_norm": 4.811601161956787, + "kl": 0.739175459370017, + "learning_rate": 5.955602440309365e-07, + "loss": 0.0739, + "num_tokens": 19214280.0, + "reward": 0.7564697265625, + "reward_std": 0.003658160800114274, + "rewards//mean": 0.7564697265625, + "rewards//std": 0.025451067835092545, + "step": 2223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4448, + "grad_norm": 3.0447213649749756, + "kl": 1.6058388724923134, + "learning_rate": 5.952487417383781e-07, + "loss": 0.1606, + "num_tokens": 19222880.0, + "reward": 0.7608642578125, + "reward_std": 0.012654721736907959, + "rewards//mean": 0.7608642578125, + "rewards//std": 0.03328031301498413, + "step": 2224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.445, + "grad_norm": 5.714029788970947, + "kl": 2.248139575123787, + "learning_rate": 5.949372010796383e-07, + "loss": 0.2248, + "num_tokens": 19231496.0, + "reward": 0.74993896484375, + "reward_std": 0.010007057338953018, + "rewards//mean": 0.74993896484375, + "rewards//std": 0.03187534958124161, + "step": 2225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4452, + "grad_norm": 6.307797908782959, + "kl": 0.6014745999127626, + "learning_rate": 5.946256221802051e-07, + "loss": 0.0601, + "num_tokens": 19240144.0, + "reward": 0.7572021484375, + "reward_std": 0.0027377079240977764, + "rewards//mean": 0.7572021484375, + "rewards//std": 0.02254435420036316, + "step": 2226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4454, + "grad_norm": 5.670596599578857, + "kl": 1.3016874808818102, + "learning_rate": 5.943140051655827e-07, + "loss": 0.1302, + "num_tokens": 19248752.0, + "reward": 0.78106689453125, + "reward_std": 0.008027156814932823, + "rewards//mean": 0.78106689453125, + "rewards//std": 0.041430167853832245, + "step": 2227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4456, + "grad_norm": 2.760960102081299, + "kl": 1.4120997283607721, + "learning_rate": 5.940023501612902e-07, + "loss": 0.1412, + "num_tokens": 19257360.0, + "reward": 0.7879638671875, + "reward_std": 0.005790394730865955, + "rewards//mean": 0.7879638671875, + "rewards//std": 0.026716234162449837, + "step": 2228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4458, + "grad_norm": 4.535856246948242, + "kl": 1.346991265192628, + "learning_rate": 5.936906572928624e-07, + "loss": 0.1347, + "num_tokens": 19265928.0, + "reward": 0.76690673828125, + "reward_std": 0.00960371270775795, + "rewards//mean": 0.76690673828125, + "rewards//std": 0.0207524336874485, + "step": 2229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.446, + "grad_norm": 4.852396011352539, + "kl": 1.7564903423190117, + "learning_rate": 5.933789266858488e-07, + "loss": 0.1756, + "num_tokens": 19274608.0, + "reward": 0.73443603515625, + "reward_std": 0.009558469988405704, + "rewards//mean": 0.73443603515625, + "rewards//std": 0.03668675944209099, + "step": 2230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4462, + "grad_norm": 2.425013303756714, + "kl": 0.9003709629178047, + "learning_rate": 5.93067158465815e-07, + "loss": 0.09, + "num_tokens": 19283232.0, + "reward": 0.7635498046875, + "reward_std": 0.005046244245022535, + "rewards//mean": 0.7635498046875, + "rewards//std": 0.02313554659485817, + "step": 2231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4464, + "grad_norm": 2.3701603412628174, + "kl": 1.7575933411717415, + "learning_rate": 5.927553527583407e-07, + "loss": 0.1758, + "num_tokens": 19291856.0, + "reward": 0.7344970703125, + "reward_std": 0.011038710363209248, + "rewards//mean": 0.7344970703125, + "rewards//std": 0.02420983649790287, + "step": 2232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4466, + "grad_norm": 3.427142858505249, + "kl": 2.197600083425641, + "learning_rate": 5.924435096890216e-07, + "loss": 0.2198, + "num_tokens": 19300440.0, + "reward": 0.7344970703125, + "reward_std": 0.014010196551680565, + "rewards//mean": 0.7344970703125, + "rewards//std": 0.033276673406362534, + "step": 2233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4468, + "grad_norm": 3.0308609008789062, + "kl": 1.918210320174694, + "learning_rate": 5.921316293834676e-07, + "loss": 0.1918, + "num_tokens": 19309072.0, + "reward": 0.72552490234375, + "reward_std": 0.011504017747938633, + "rewards//mean": 0.72552490234375, + "rewards//std": 0.02771102450788021, + "step": 2234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.447, + "grad_norm": 4.685697078704834, + "kl": 1.05201830342412, + "learning_rate": 5.918197119673046e-07, + "loss": 0.1052, + "num_tokens": 19317760.0, + "reward": 0.73870849609375, + "reward_std": 0.007364805322140455, + "rewards//mean": 0.73870849609375, + "rewards//std": 0.02881542406976223, + "step": 2235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4472, + "grad_norm": 3.32118558883667, + "kl": 1.2251533586531878, + "learning_rate": 5.915077575661722e-07, + "loss": 0.1225, + "num_tokens": 19326376.0, + "reward": 0.74700927734375, + "reward_std": 0.009418869391083717, + "rewards//mean": 0.74700927734375, + "rewards//std": 0.025683382526040077, + "step": 2236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4474, + "grad_norm": 4.142004489898682, + "kl": 1.7761543840169907, + "learning_rate": 5.911957663057263e-07, + "loss": 0.1776, + "num_tokens": 19335152.0, + "reward": 0.77783203125, + "reward_std": 0.006641753017902374, + "rewards//mean": 0.77783203125, + "rewards//std": 0.03624178469181061, + "step": 2237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4476, + "grad_norm": 4.638620376586914, + "kl": 1.7298043835908175, + "learning_rate": 5.908837383116367e-07, + "loss": 0.173, + "num_tokens": 19343960.0, + "reward": 0.73443603515625, + "reward_std": 0.01136423647403717, + "rewards//mean": 0.73443603515625, + "rewards//std": 0.03913988173007965, + "step": 2238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4478, + "grad_norm": 9.001989364624023, + "kl": 0.9885972570627928, + "learning_rate": 5.905716737095879e-07, + "loss": 0.0989, + "num_tokens": 19352728.0, + "reward": 0.78692626953125, + "reward_std": 0.00855414941906929, + "rewards//mean": 0.78692626953125, + "rewards//std": 0.029560690745711327, + "step": 2239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.448, + "grad_norm": 4.2129621505737305, + "kl": 2.3857249096035957, + "learning_rate": 5.9025957262528e-07, + "loss": 0.2386, + "num_tokens": 19361312.0, + "reward": 0.76300048828125, + "reward_std": 0.015018035657703876, + "rewards//mean": 0.76300048828125, + "rewards//std": 0.03025011532008648, + "step": 2240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4482, + "grad_norm": 5.144629955291748, + "kl": 1.4536605570465326, + "learning_rate": 5.899474351844269e-07, + "loss": 0.1454, + "num_tokens": 19370032.0, + "reward": 0.7764892578125, + "reward_std": 0.012827062048017979, + "rewards//mean": 0.7764892578125, + "rewards//std": 0.03283337503671646, + "step": 2241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4484, + "grad_norm": 4.99237060546875, + "kl": 2.86059533059597, + "learning_rate": 5.896352615127578e-07, + "loss": 0.2861, + "num_tokens": 19378760.0, + "reward": 0.7662353515625, + "reward_std": 0.017454706132411957, + "rewards//mean": 0.7662353515625, + "rewards//std": 0.03660564124584198, + "step": 2242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4486, + "grad_norm": 3.0607526302337646, + "kl": 1.372886810451746, + "learning_rate": 5.893230517360159e-07, + "loss": 0.1373, + "num_tokens": 19387488.0, + "reward": 0.77734375, + "reward_std": 0.008747629821300507, + "rewards//mean": 0.77734375, + "rewards//std": 0.023313162848353386, + "step": 2243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4488, + "grad_norm": 5.0679192543029785, + "kl": 1.4068742860108614, + "learning_rate": 5.890108059799595e-07, + "loss": 0.1407, + "num_tokens": 19396104.0, + "reward": 0.74285888671875, + "reward_std": 0.011632399633526802, + "rewards//mean": 0.74285888671875, + "rewards//std": 0.02476947195827961, + "step": 2244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.449, + "grad_norm": 4.523594379425049, + "kl": 1.4557523764669895, + "learning_rate": 5.886985243703611e-07, + "loss": 0.1456, + "num_tokens": 19404688.0, + "reward": 0.79681396484375, + "reward_std": 0.010571167804300785, + "rewards//mean": 0.79681396484375, + "rewards//std": 0.027748145163059235, + "step": 2245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4492, + "grad_norm": 3.1926369667053223, + "kl": 1.2903675809502602, + "learning_rate": 5.883862070330078e-07, + "loss": 0.129, + "num_tokens": 19413216.0, + "reward": 0.7322998046875, + "reward_std": 0.009928472340106964, + "rewards//mean": 0.7322998046875, + "rewards//std": 0.030265437439084053, + "step": 2246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4494, + "grad_norm": 3.0452635288238525, + "kl": 1.8456795923411846, + "learning_rate": 5.880738540937007e-07, + "loss": 0.1846, + "num_tokens": 19421816.0, + "reward": 0.75506591796875, + "reward_std": 0.009103763848543167, + "rewards//mean": 0.75506591796875, + "rewards//std": 0.03393048793077469, + "step": 2247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4496, + "grad_norm": 19.630908966064453, + "kl": 2.499043306335807, + "learning_rate": 5.877614656782559e-07, + "loss": 0.2499, + "num_tokens": 19430480.0, + "reward": 0.73516845703125, + "reward_std": 0.01679658144712448, + "rewards//mean": 0.73516845703125, + "rewards//std": 0.03420420363545418, + "step": 2248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4498, + "grad_norm": 2.1093199253082275, + "kl": 1.6102879792451859, + "learning_rate": 5.874490419125032e-07, + "loss": 0.161, + "num_tokens": 19439184.0, + "reward": 0.76776123046875, + "reward_std": 0.01153447013348341, + "rewards//mean": 0.76776123046875, + "rewards//std": 0.01978297345340252, + "step": 2249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.45, + "grad_norm": 7.4742631912231445, + "kl": 2.9134430456906557, + "learning_rate": 5.871365829222868e-07, + "loss": 0.2913, + "num_tokens": 19447808.0, + "reward": 0.763671875, + "reward_std": 0.022339336574077606, + "rewards//mean": 0.763671875, + "rewards//std": 0.03964105248451233, + "step": 2250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4502, + "grad_norm": 11.14273738861084, + "kl": 2.5575021505355835, + "learning_rate": 5.868240888334652e-07, + "loss": 0.2558, + "num_tokens": 19456568.0, + "reward": 0.759033203125, + "reward_std": 0.013632156886160374, + "rewards//mean": 0.759033203125, + "rewards//std": 0.04548979923129082, + "step": 2251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4504, + "grad_norm": 13.798040390014648, + "kl": 2.2919829171150923, + "learning_rate": 5.86511559771911e-07, + "loss": 0.2292, + "num_tokens": 19465152.0, + "reward": 0.78948974609375, + "reward_std": 0.009421843104064465, + "rewards//mean": 0.78948974609375, + "rewards//std": 0.028392048552632332, + "step": 2252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4506, + "grad_norm": 6.453871250152588, + "kl": 0.6344418078660965, + "learning_rate": 5.861989958635109e-07, + "loss": 0.0634, + "num_tokens": 19473784.0, + "reward": 0.7613525390625, + "reward_std": 0.003452669596299529, + "rewards//mean": 0.7613525390625, + "rewards//std": 0.025091663002967834, + "step": 2253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4508, + "grad_norm": 2.3491311073303223, + "kl": 1.2700868621468544, + "learning_rate": 5.858863972341655e-07, + "loss": 0.127, + "num_tokens": 19482384.0, + "reward": 0.734619140625, + "reward_std": 0.008741095662117004, + "rewards//mean": 0.734619140625, + "rewards//std": 0.03888772428035736, + "step": 2254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.451, + "grad_norm": 2.915720224380493, + "kl": 1.6155210081487894, + "learning_rate": 5.855737640097897e-07, + "loss": 0.1616, + "num_tokens": 19490976.0, + "reward": 0.74755859375, + "reward_std": 0.012390751391649246, + "rewards//mean": 0.74755859375, + "rewards//std": 0.027982894331216812, + "step": 2255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4512, + "grad_norm": 3.2181129455566406, + "kl": 1.1250402592122555, + "learning_rate": 5.852610963163119e-07, + "loss": 0.1125, + "num_tokens": 19499600.0, + "reward": 0.78741455078125, + "reward_std": 0.0062089161947369576, + "rewards//mean": 0.78741455078125, + "rewards//std": 0.018358975648880005, + "step": 2256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4514, + "grad_norm": 6.91610050201416, + "kl": 1.4435250479727983, + "learning_rate": 5.849483942796747e-07, + "loss": 0.1444, + "num_tokens": 19508192.0, + "reward": 0.75537109375, + "reward_std": 0.006165365222841501, + "rewards//mean": 0.75537109375, + "rewards//std": 0.04654053598642349, + "step": 2257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4516, + "grad_norm": 5.179140567779541, + "kl": 1.892649443820119, + "learning_rate": 5.846356580258345e-07, + "loss": 0.1893, + "num_tokens": 19516856.0, + "reward": 0.74609375, + "reward_std": 0.008804559707641602, + "rewards//mean": 0.74609375, + "rewards//std": 0.02936396934092045, + "step": 2258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4518, + "grad_norm": 4.5168962478637695, + "kl": 1.6130703128874302, + "learning_rate": 5.843228876807613e-07, + "loss": 0.1613, + "num_tokens": 19525392.0, + "reward": 0.7567138671875, + "reward_std": 0.012764401733875275, + "rewards//mean": 0.7567138671875, + "rewards//std": 0.0328776054084301, + "step": 2259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.452, + "grad_norm": 3.725329875946045, + "kl": 1.3526629097759724, + "learning_rate": 5.840100833704391e-07, + "loss": 0.1353, + "num_tokens": 19534040.0, + "reward": 0.78155517578125, + "reward_std": 0.008189897984266281, + "rewards//mean": 0.78155517578125, + "rewards//std": 0.02445208840072155, + "step": 2260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4522, + "grad_norm": 2.4616634845733643, + "kl": 1.0076962783932686, + "learning_rate": 5.836972452208654e-07, + "loss": 0.1008, + "num_tokens": 19542632.0, + "reward": 0.7542724609375, + "reward_std": 0.007578426506370306, + "rewards//mean": 0.7542724609375, + "rewards//std": 0.028634998947381973, + "step": 2261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4524, + "grad_norm": 3.1592252254486084, + "kl": 1.5775563437491655, + "learning_rate": 5.833843733580512e-07, + "loss": 0.1578, + "num_tokens": 19551264.0, + "reward": 0.73858642578125, + "reward_std": 0.007352606393396854, + "rewards//mean": 0.73858642578125, + "rewards//std": 0.024629736319184303, + "step": 2262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4526, + "grad_norm": 5.9291605949401855, + "kl": 1.993557408452034, + "learning_rate": 5.830714679080215e-07, + "loss": 0.1994, + "num_tokens": 19559848.0, + "reward": 0.7352294921875, + "reward_std": 0.014678683131933212, + "rewards//mean": 0.7352294921875, + "rewards//std": 0.041651953011751175, + "step": 2263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4528, + "grad_norm": 4.6405439376831055, + "kl": 0.9084633458405733, + "learning_rate": 5.827585289968142e-07, + "loss": 0.0908, + "num_tokens": 19568432.0, + "reward": 0.77325439453125, + "reward_std": 0.005379711743444204, + "rewards//mean": 0.77325439453125, + "rewards//std": 0.029433418065309525, + "step": 2264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.453, + "grad_norm": 4.013720512390137, + "kl": 1.3415140621364117, + "learning_rate": 5.824455567504817e-07, + "loss": 0.1342, + "num_tokens": 19577152.0, + "reward": 0.76556396484375, + "reward_std": 0.011306493543088436, + "rewards//mean": 0.76556396484375, + "rewards//std": 0.026064835488796234, + "step": 2265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4532, + "grad_norm": 5.630840301513672, + "kl": 1.7550586387515068, + "learning_rate": 5.821325512950885e-07, + "loss": 0.1755, + "num_tokens": 19585824.0, + "reward": 0.766845703125, + "reward_std": 0.015313655138015747, + "rewards//mean": 0.766845703125, + "rewards//std": 0.040968358516693115, + "step": 2266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4534, + "grad_norm": 4.251613616943359, + "kl": 1.480274161323905, + "learning_rate": 5.818195127567135e-07, + "loss": 0.148, + "num_tokens": 19594456.0, + "reward": 0.7344970703125, + "reward_std": 0.013099726289510727, + "rewards//mean": 0.7344970703125, + "rewards//std": 0.03744783625006676, + "step": 2267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4536, + "grad_norm": 3.591388702392578, + "kl": 1.122734671458602, + "learning_rate": 5.815064412614486e-07, + "loss": 0.1123, + "num_tokens": 19603184.0, + "reward": 0.7447509765625, + "reward_std": 0.006167711224406958, + "rewards//mean": 0.7447509765625, + "rewards//std": 0.03296405076980591, + "step": 2268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4538, + "grad_norm": 5.940333366394043, + "kl": 1.7895222436636686, + "learning_rate": 5.81193336935399e-07, + "loss": 0.179, + "num_tokens": 19611816.0, + "reward": 0.7520751953125, + "reward_std": 0.0103962616994977, + "rewards//mean": 0.7520751953125, + "rewards//std": 0.031273003667593, + "step": 2269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.454, + "grad_norm": 2.5307295322418213, + "kl": 1.2336862310767174, + "learning_rate": 5.808801999046829e-07, + "loss": 0.1234, + "num_tokens": 19620504.0, + "reward": 0.7423095703125, + "reward_std": 0.008760766126215458, + "rewards//mean": 0.7423095703125, + "rewards//std": 0.03391827270388603, + "step": 2270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4542, + "grad_norm": 13.41973876953125, + "kl": 2.3068683687597513, + "learning_rate": 5.805670302954321e-07, + "loss": 0.2307, + "num_tokens": 19629192.0, + "reward": 0.72930908203125, + "reward_std": 0.012938005849719048, + "rewards//mean": 0.72930908203125, + "rewards//std": 0.03788963332772255, + "step": 2271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4544, + "grad_norm": 1.6860803365707397, + "kl": 0.8884401004761457, + "learning_rate": 5.802538282337909e-07, + "loss": 0.0888, + "num_tokens": 19637832.0, + "reward": 0.7449951171875, + "reward_std": 0.007689398247748613, + "rewards//mean": 0.7449951171875, + "rewards//std": 0.01999654993414879, + "step": 2272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4546, + "grad_norm": 5.04882287979126, + "kl": 1.680007193237543, + "learning_rate": 5.799405938459174e-07, + "loss": 0.168, + "num_tokens": 19646472.0, + "reward": 0.76422119140625, + "reward_std": 0.007334953173995018, + "rewards//mean": 0.76422119140625, + "rewards//std": 0.03278094530105591, + "step": 2273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4548, + "grad_norm": 5.619682788848877, + "kl": 1.5738869309425354, + "learning_rate": 5.796273272579823e-07, + "loss": 0.1574, + "num_tokens": 19655080.0, + "reward": 0.7183837890625, + "reward_std": 0.014580807648599148, + "rewards//mean": 0.7183837890625, + "rewards//std": 0.03793460503220558, + "step": 2274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.455, + "grad_norm": 3.418785572052002, + "kl": 1.935205116868019, + "learning_rate": 5.793140285961692e-07, + "loss": 0.1935, + "num_tokens": 19663912.0, + "reward": 0.76513671875, + "reward_std": 0.01670861430466175, + "rewards//mean": 0.76513671875, + "rewards//std": 0.034635696560144424, + "step": 2275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4552, + "grad_norm": 2.6632421016693115, + "kl": 0.9768502432852983, + "learning_rate": 5.79000697986675e-07, + "loss": 0.0977, + "num_tokens": 19672512.0, + "reward": 0.75347900390625, + "reward_std": 0.006700205150991678, + "rewards//mean": 0.75347900390625, + "rewards//std": 0.031025337055325508, + "step": 2276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4554, + "grad_norm": 2.0885069370269775, + "kl": 1.0108186583966017, + "learning_rate": 5.78687335555709e-07, + "loss": 0.1011, + "num_tokens": 19681216.0, + "reward": 0.760498046875, + "reward_std": 0.005556948017328978, + "rewards//mean": 0.760498046875, + "rewards//std": 0.026747096329927444, + "step": 2277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4556, + "grad_norm": 2.082794666290283, + "kl": 0.7659388724714518, + "learning_rate": 5.783739414294937e-07, + "loss": 0.0766, + "num_tokens": 19689840.0, + "reward": 0.7401123046875, + "reward_std": 0.0030591851100325584, + "rewards//mean": 0.7401123046875, + "rewards//std": 0.03374645859003067, + "step": 2278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4558, + "grad_norm": 2.734874725341797, + "kl": 1.0409807413816452, + "learning_rate": 5.780605157342641e-07, + "loss": 0.1041, + "num_tokens": 19698464.0, + "reward": 0.76141357421875, + "reward_std": 0.009999404661357403, + "rewards//mean": 0.76141357421875, + "rewards//std": 0.030027108266949654, + "step": 2279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.456, + "grad_norm": 4.524035930633545, + "kl": 1.1656199656426907, + "learning_rate": 5.777470585962681e-07, + "loss": 0.1166, + "num_tokens": 19707088.0, + "reward": 0.73291015625, + "reward_std": 0.006849617697298527, + "rewards//mean": 0.73291015625, + "rewards//std": 0.025195516645908356, + "step": 2280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4562, + "grad_norm": 0.24029770493507385, + "kl": 0.43820466473698616, + "learning_rate": 5.774335701417662e-07, + "loss": 0.0438, + "num_tokens": 19715640.0, + "reward": 0.78314208984375, + "reward_std": 0.0005179004510864615, + "rewards//mean": 0.78314208984375, + "rewards//std": 0.03393985703587532, + "step": 2281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4564, + "grad_norm": 2.8862388134002686, + "kl": 1.4203681889921427, + "learning_rate": 5.771200504970315e-07, + "loss": 0.142, + "num_tokens": 19724360.0, + "reward": 0.80975341796875, + "reward_std": 0.012539228424429893, + "rewards//mean": 0.80975341796875, + "rewards//std": 0.03106873109936714, + "step": 2282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4566, + "grad_norm": 8.379032135009766, + "kl": 1.5679272580891848, + "learning_rate": 5.768064997883498e-07, + "loss": 0.1568, + "num_tokens": 19732928.0, + "reward": 0.75146484375, + "reward_std": 0.010234023444354534, + "rewards//mean": 0.75146484375, + "rewards//std": 0.030746029689908028, + "step": 2283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4568, + "grad_norm": 3.7703654766082764, + "kl": 1.4771138653159142, + "learning_rate": 5.764929181420191e-07, + "loss": 0.1477, + "num_tokens": 19741496.0, + "reward": 0.7711181640625, + "reward_std": 0.01939561776816845, + "rewards//mean": 0.7711181640625, + "rewards//std": 0.03047080710530281, + "step": 2284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.457, + "grad_norm": 2.825913429260254, + "kl": 1.6202249489724636, + "learning_rate": 5.7617930568435e-07, + "loss": 0.162, + "num_tokens": 19750192.0, + "reward": 0.75677490234375, + "reward_std": 0.010470375418663025, + "rewards//mean": 0.75677490234375, + "rewards//std": 0.03360458090901375, + "step": 2285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4572, + "grad_norm": 1.3622242212295532, + "kl": 1.71459929831326, + "learning_rate": 5.758656625416658e-07, + "loss": 0.1715, + "num_tokens": 19758880.0, + "reward": 0.7625732421875, + "reward_std": 0.00901164673268795, + "rewards//mean": 0.7625732421875, + "rewards//std": 0.034955672919750214, + "step": 2286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4574, + "grad_norm": 6.915200233459473, + "kl": 1.5001412760466337, + "learning_rate": 5.755519888403017e-07, + "loss": 0.15, + "num_tokens": 19767504.0, + "reward": 0.72113037109375, + "reward_std": 0.003564273938536644, + "rewards//mean": 0.72113037109375, + "rewards//std": 0.03800412267446518, + "step": 2287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4576, + "grad_norm": 5.2832350730896, + "kl": 1.0978982914239168, + "learning_rate": 5.752382847066058e-07, + "loss": 0.1098, + "num_tokens": 19776120.0, + "reward": 0.77276611328125, + "reward_std": 0.010398730635643005, + "rewards//mean": 0.77276611328125, + "rewards//std": 0.02893652208149433, + "step": 2288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4578, + "grad_norm": 2.298997402191162, + "kl": 1.235537650063634, + "learning_rate": 5.749245502669375e-07, + "loss": 0.1236, + "num_tokens": 19784808.0, + "reward": 0.7828369140625, + "reward_std": 0.010551662184298038, + "rewards//mean": 0.7828369140625, + "rewards//std": 0.028414232656359673, + "step": 2289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.458, + "grad_norm": 2.1068131923675537, + "kl": 1.8870402574539185, + "learning_rate": 5.746107856476694e-07, + "loss": 0.1887, + "num_tokens": 19793496.0, + "reward": 0.7496337890625, + "reward_std": 0.011930773966014385, + "rewards//mean": 0.7496337890625, + "rewards//std": 0.03635667636990547, + "step": 2290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4582, + "grad_norm": 3.6472511291503906, + "kl": 1.5486475992947817, + "learning_rate": 5.742969909751858e-07, + "loss": 0.1549, + "num_tokens": 19802080.0, + "reward": 0.7484130859375, + "reward_std": 0.011839975602924824, + "rewards//mean": 0.7484130859375, + "rewards//std": 0.032650288194417953, + "step": 2291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4584, + "grad_norm": 6.595364570617676, + "kl": 1.865392193198204, + "learning_rate": 5.739831663758833e-07, + "loss": 0.1865, + "num_tokens": 19810720.0, + "reward": 0.714111328125, + "reward_std": 0.008248982951045036, + "rewards//mean": 0.714111328125, + "rewards//std": 0.04612429440021515, + "step": 2292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4586, + "grad_norm": 4.3111491203308105, + "kl": 2.206091396510601, + "learning_rate": 5.7366931197617e-07, + "loss": 0.2206, + "num_tokens": 19819376.0, + "reward": 0.75244140625, + "reward_std": 0.017201296985149384, + "rewards//mean": 0.75244140625, + "rewards//std": 0.04012385010719299, + "step": 2293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4588, + "grad_norm": 2.750823497772217, + "kl": 1.9352333936840296, + "learning_rate": 5.733554279024667e-07, + "loss": 0.1935, + "num_tokens": 19828088.0, + "reward": 0.77386474609375, + "reward_std": 0.012679493054747581, + "rewards//mean": 0.77386474609375, + "rewards//std": 0.03327047452330589, + "step": 2294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.459, + "grad_norm": 4.1313652992248535, + "kl": 1.0644458197057247, + "learning_rate": 5.730415142812058e-07, + "loss": 0.1064, + "num_tokens": 19836696.0, + "reward": 0.70172119140625, + "reward_std": 0.007787439972162247, + "rewards//mean": 0.70172119140625, + "rewards//std": 0.03500343859195709, + "step": 2295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 127.21875, + "epoch": 0.4592, + "grad_norm": 2.8158137798309326, + "kl": 1.3696059808135033, + "learning_rate": 5.727275712388317e-07, + "loss": 0.1334, + "num_tokens": 19845254.0, + "reward": 0.73834228515625, + "reward_std": 0.014739073812961578, + "rewards//mean": 0.73834228515625, + "rewards//std": 0.04090100899338722, + "step": 2296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4594, + "grad_norm": 1.4619685411453247, + "kl": 1.0665322300046682, + "learning_rate": 5.724135989018006e-07, + "loss": 0.1067, + "num_tokens": 19853894.0, + "reward": 0.78631591796875, + "reward_std": 0.005902774166315794, + "rewards//mean": 0.78631591796875, + "rewards//std": 0.030094586312770844, + "step": 2297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4596, + "grad_norm": 5.505870819091797, + "kl": 1.600073168054223, + "learning_rate": 5.720995973965805e-07, + "loss": 0.16, + "num_tokens": 19862542.0, + "reward": 0.77166748046875, + "reward_std": 0.0137926135212183, + "rewards//mean": 0.77166748046875, + "rewards//std": 0.02982172742486, + "step": 2298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4598, + "grad_norm": 3.689547300338745, + "kl": 0.8799671772867441, + "learning_rate": 5.717855668496513e-07, + "loss": 0.088, + "num_tokens": 19871198.0, + "reward": 0.74993896484375, + "reward_std": 0.008864214643836021, + "rewards//mean": 0.74993896484375, + "rewards//std": 0.027835296466946602, + "step": 2299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.46, + "grad_norm": 3.558633327484131, + "kl": 1.4338841624557972, + "learning_rate": 5.714715073875043e-07, + "loss": 0.1434, + "num_tokens": 19879830.0, + "reward": 0.74383544921875, + "reward_std": 0.013754835352301598, + "rewards//mean": 0.74383544921875, + "rewards//std": 0.03137225657701492, + "step": 2300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4602, + "grad_norm": 2.6467478275299072, + "kl": 1.0575682818889618, + "learning_rate": 5.711574191366427e-07, + "loss": 0.1058, + "num_tokens": 19888430.0, + "reward": 0.75311279296875, + "reward_std": 0.009000520221889019, + "rewards//mean": 0.75311279296875, + "rewards//std": 0.02298020012676716, + "step": 2301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4604, + "grad_norm": 4.254924297332764, + "kl": 1.149335527792573, + "learning_rate": 5.70843302223581e-07, + "loss": 0.1149, + "num_tokens": 19897126.0, + "reward": 0.77099609375, + "reward_std": 0.014753825962543488, + "rewards//mean": 0.77099609375, + "rewards//std": 0.027809247374534607, + "step": 2302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4606, + "grad_norm": 3.3665614128112793, + "kl": 1.1195424757897854, + "learning_rate": 5.705291567748458e-07, + "loss": 0.112, + "num_tokens": 19905758.0, + "reward": 0.7655029296875, + "reward_std": 0.005668755155056715, + "rewards//mean": 0.7655029296875, + "rewards//std": 0.03225661441683769, + "step": 2303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4608, + "grad_norm": 7.425473690032959, + "kl": 1.5551208686083555, + "learning_rate": 5.702149829169746e-07, + "loss": 0.1555, + "num_tokens": 19914382.0, + "reward": 0.77947998046875, + "reward_std": 0.015478307381272316, + "rewards//mean": 0.77947998046875, + "rewards//std": 0.029585260897874832, + "step": 2304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.461, + "grad_norm": 2.5418999195098877, + "kl": 2.107375605031848, + "learning_rate": 5.699007807765168e-07, + "loss": 0.2107, + "num_tokens": 19922926.0, + "reward": 0.76983642578125, + "reward_std": 0.015244618058204651, + "rewards//mean": 0.76983642578125, + "rewards//std": 0.027972545474767685, + "step": 2305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4612, + "grad_norm": 16.62290382385254, + "kl": 2.8820286486297846, + "learning_rate": 5.695865504800327e-07, + "loss": 0.2882, + "num_tokens": 19931582.0, + "reward": 0.73858642578125, + "reward_std": 0.008437427692115307, + "rewards//mean": 0.73858642578125, + "rewards//std": 0.03225162625312805, + "step": 2306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4614, + "grad_norm": 5.047966480255127, + "kl": 2.3959045447409153, + "learning_rate": 5.692722921540945e-07, + "loss": 0.2396, + "num_tokens": 19940342.0, + "reward": 0.76123046875, + "reward_std": 0.016889663413167, + "rewards//mean": 0.76123046875, + "rewards//std": 0.04139548912644386, + "step": 2307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4616, + "grad_norm": 10.113832473754883, + "kl": 2.460652766749263, + "learning_rate": 5.689580059252852e-07, + "loss": 0.2461, + "num_tokens": 19948934.0, + "reward": 0.767822265625, + "reward_std": 0.013643529266119003, + "rewards//mean": 0.767822265625, + "rewards//std": 0.029813161119818687, + "step": 2308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4618, + "grad_norm": 2.8222296237945557, + "kl": 1.654306299984455, + "learning_rate": 5.686436919201996e-07, + "loss": 0.1654, + "num_tokens": 19957590.0, + "reward": 0.724853515625, + "reward_std": 0.009500568732619286, + "rewards//mean": 0.724853515625, + "rewards//std": 0.0390431210398674, + "step": 2309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.462, + "grad_norm": 8.957334518432617, + "kl": 2.0935809295624495, + "learning_rate": 5.683293502654428e-07, + "loss": 0.2094, + "num_tokens": 19966190.0, + "reward": 0.728759765625, + "reward_std": 0.009076015092432499, + "rewards//mean": 0.728759765625, + "rewards//std": 0.030208613723516464, + "step": 2310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4622, + "grad_norm": 8.834135055541992, + "kl": 2.0918173901736736, + "learning_rate": 5.680149810876322e-07, + "loss": 0.2092, + "num_tokens": 19974838.0, + "reward": 0.72412109375, + "reward_std": 0.010887807235121727, + "rewards//mean": 0.72412109375, + "rewards//std": 0.03261898085474968, + "step": 2311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4624, + "grad_norm": 5.16759729385376, + "kl": 1.57982724532485, + "learning_rate": 5.677005845133951e-07, + "loss": 0.158, + "num_tokens": 19983462.0, + "reward": 0.7791748046875, + "reward_std": 0.013439645990729332, + "rewards//mean": 0.7791748046875, + "rewards//std": 0.02958557941019535, + "step": 2312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4626, + "grad_norm": 7.889267921447754, + "kl": 1.765737121924758, + "learning_rate": 5.673861606693707e-07, + "loss": 0.1766, + "num_tokens": 19992062.0, + "reward": 0.7662353515625, + "reward_std": 0.009925240650773048, + "rewards//mean": 0.7662353515625, + "rewards//std": 0.031220680102705956, + "step": 2313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4628, + "grad_norm": 9.213332176208496, + "kl": 2.858308671042323, + "learning_rate": 5.670717096822088e-07, + "loss": 0.2858, + "num_tokens": 20000670.0, + "reward": 0.7562255859375, + "reward_std": 0.019939731806516647, + "rewards//mean": 0.7562255859375, + "rewards//std": 0.03856779262423515, + "step": 2314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.463, + "grad_norm": 9.588549613952637, + "kl": 2.275641893967986, + "learning_rate": 5.667572316785705e-07, + "loss": 0.2276, + "num_tokens": 20009382.0, + "reward": 0.7445068359375, + "reward_std": 0.011462279595434666, + "rewards//mean": 0.7445068359375, + "rewards//std": 0.032224684953689575, + "step": 2315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4632, + "grad_norm": 2.9496564865112305, + "kl": 1.0349012799561024, + "learning_rate": 5.664427267851271e-07, + "loss": 0.1035, + "num_tokens": 20017990.0, + "reward": 0.73046875, + "reward_std": 0.010286872275173664, + "rewards//mean": 0.73046875, + "rewards//std": 0.03610452264547348, + "step": 2316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4634, + "grad_norm": 9.274938583374023, + "kl": 1.81477808393538, + "learning_rate": 5.661281951285612e-07, + "loss": 0.1815, + "num_tokens": 20026638.0, + "reward": 0.784912109375, + "reward_std": 0.01661362126469612, + "rewards//mean": 0.784912109375, + "rewards//std": 0.04215714707970619, + "step": 2317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4636, + "grad_norm": 5.335949420928955, + "kl": 1.8578124716877937, + "learning_rate": 5.658136368355664e-07, + "loss": 0.1858, + "num_tokens": 20035294.0, + "reward": 0.74322509765625, + "reward_std": 0.018399160355329514, + "rewards//mean": 0.74322509765625, + "rewards//std": 0.03870898112654686, + "step": 2318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4638, + "grad_norm": 3.8823208808898926, + "kl": 1.5303014554083347, + "learning_rate": 5.654990520328464e-07, + "loss": 0.153, + "num_tokens": 20043990.0, + "reward": 0.72210693359375, + "reward_std": 0.015086507424712181, + "rewards//mean": 0.72210693359375, + "rewards//std": 0.03736948221921921, + "step": 2319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.464, + "grad_norm": 3.0423474311828613, + "kl": 0.9793323148041964, + "learning_rate": 5.651844408471162e-07, + "loss": 0.0979, + "num_tokens": 20052646.0, + "reward": 0.76800537109375, + "reward_std": 0.009244940243661404, + "rewards//mean": 0.76800537109375, + "rewards//std": 0.03477608785033226, + "step": 2320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4642, + "grad_norm": 5.1409525871276855, + "kl": 1.4399462137371302, + "learning_rate": 5.648698034051008e-07, + "loss": 0.144, + "num_tokens": 20061326.0, + "reward": 0.76251220703125, + "reward_std": 0.008960804902017117, + "rewards//mean": 0.76251220703125, + "rewards//std": 0.03300048038363457, + "step": 2321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4644, + "grad_norm": 6.581009864807129, + "kl": 1.4599250629544258, + "learning_rate": 5.645551398335366e-07, + "loss": 0.146, + "num_tokens": 20070030.0, + "reward": 0.75909423828125, + "reward_std": 0.01530779991298914, + "rewards//mean": 0.75909423828125, + "rewards//std": 0.040042608976364136, + "step": 2322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4646, + "grad_norm": 4.674483776092529, + "kl": 1.5661939792335033, + "learning_rate": 5.642404502591697e-07, + "loss": 0.1566, + "num_tokens": 20078638.0, + "reward": 0.758544921875, + "reward_std": 0.013586866669356823, + "rewards//mean": 0.758544921875, + "rewards//std": 0.039302803575992584, + "step": 2323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4648, + "grad_norm": 3.5567595958709717, + "kl": 0.8441638015210629, + "learning_rate": 5.639257348087572e-07, + "loss": 0.0844, + "num_tokens": 20087238.0, + "reward": 0.7611083984375, + "reward_std": 0.009858135133981705, + "rewards//mean": 0.7611083984375, + "rewards//std": 0.04005569592118263, + "step": 2324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.465, + "grad_norm": 4.788660049438477, + "kl": 1.6492008958011866, + "learning_rate": 5.636109936090661e-07, + "loss": 0.1649, + "num_tokens": 20095870.0, + "reward": 0.72271728515625, + "reward_std": 0.011165713891386986, + "rewards//mean": 0.72271728515625, + "rewards//std": 0.02750265784561634, + "step": 2325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4652, + "grad_norm": 5.559743881225586, + "kl": 2.147373478859663, + "learning_rate": 5.632962267868746e-07, + "loss": 0.2147, + "num_tokens": 20104526.0, + "reward": 0.75775146484375, + "reward_std": 0.014038216322660446, + "rewards//mean": 0.75775146484375, + "rewards//std": 0.03556034341454506, + "step": 2326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4654, + "grad_norm": 4.905421733856201, + "kl": 1.0318863317370415, + "learning_rate": 5.629814344689705e-07, + "loss": 0.1032, + "num_tokens": 20113086.0, + "reward": 0.75439453125, + "reward_std": 0.009321057237684727, + "rewards//mean": 0.75439453125, + "rewards//std": 0.02076929435133934, + "step": 2327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4656, + "grad_norm": 1.7553902864456177, + "kl": 0.930904246866703, + "learning_rate": 5.626666167821521e-07, + "loss": 0.0931, + "num_tokens": 20121814.0, + "reward": 0.74993896484375, + "reward_std": 0.0059691788628697395, + "rewards//mean": 0.74993896484375, + "rewards//std": 0.03247007727622986, + "step": 2328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4658, + "grad_norm": 3.464634895324707, + "kl": 0.9578096698969603, + "learning_rate": 5.623517738532279e-07, + "loss": 0.0958, + "num_tokens": 20130454.0, + "reward": 0.7445068359375, + "reward_std": 0.008606933057308197, + "rewards//mean": 0.7445068359375, + "rewards//std": 0.03139474615454674, + "step": 2329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.466, + "grad_norm": 2.9122116565704346, + "kl": 1.6289369370788336, + "learning_rate": 5.620369058090168e-07, + "loss": 0.1629, + "num_tokens": 20139062.0, + "reward": 0.777099609375, + "reward_std": 0.011115273460745811, + "rewards//mean": 0.777099609375, + "rewards//std": 0.025975316762924194, + "step": 2330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4662, + "grad_norm": 4.663967609405518, + "kl": 1.2057207133620977, + "learning_rate": 5.617220127763474e-07, + "loss": 0.1206, + "num_tokens": 20147694.0, + "reward": 0.75347900390625, + "reward_std": 0.013228155672550201, + "rewards//mean": 0.75347900390625, + "rewards//std": 0.03711748123168945, + "step": 2331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4664, + "grad_norm": 6.392350673675537, + "kl": 0.8989686723798513, + "learning_rate": 5.614070948820585e-07, + "loss": 0.0899, + "num_tokens": 20156334.0, + "reward": 0.75225830078125, + "reward_std": 0.004620090592652559, + "rewards//mean": 0.75225830078125, + "rewards//std": 0.026289785280823708, + "step": 2332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4666, + "grad_norm": 5.158771991729736, + "kl": 1.4124057721346617, + "learning_rate": 5.610921522529993e-07, + "loss": 0.1412, + "num_tokens": 20164910.0, + "reward": 0.73828125, + "reward_std": 0.010326679795980453, + "rewards//mean": 0.73828125, + "rewards//std": 0.023114927113056183, + "step": 2333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4668, + "grad_norm": 2.983321189880371, + "kl": 1.3028583489358425, + "learning_rate": 5.607771850160284e-07, + "loss": 0.1303, + "num_tokens": 20173638.0, + "reward": 0.77581787109375, + "reward_std": 0.0092705637216568, + "rewards//mean": 0.77581787109375, + "rewards//std": 0.03195219114422798, + "step": 2334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.467, + "grad_norm": 4.613224506378174, + "kl": 0.9228640850633383, + "learning_rate": 5.604621932980147e-07, + "loss": 0.0923, + "num_tokens": 20182278.0, + "reward": 0.74420166015625, + "reward_std": 0.007500991225242615, + "rewards//mean": 0.74420166015625, + "rewards//std": 0.02901540882885456, + "step": 2335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4672, + "grad_norm": 6.171602249145508, + "kl": 0.7219446841627359, + "learning_rate": 5.601471772258367e-07, + "loss": 0.0722, + "num_tokens": 20190862.0, + "reward": 0.74676513671875, + "reward_std": 0.00646547507494688, + "rewards//mean": 0.74676513671875, + "rewards//std": 0.03341167792677879, + "step": 2336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4674, + "grad_norm": 3.3712387084960938, + "kl": 1.4347156640142202, + "learning_rate": 5.598321369263829e-07, + "loss": 0.1435, + "num_tokens": 20199550.0, + "reward": 0.77764892578125, + "reward_std": 0.014297138899564743, + "rewards//mean": 0.77764892578125, + "rewards//std": 0.04165681451559067, + "step": 2337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4676, + "grad_norm": 2.6326801776885986, + "kl": 1.5405096355825663, + "learning_rate": 5.595170725265516e-07, + "loss": 0.1541, + "num_tokens": 20208166.0, + "reward": 0.75933837890625, + "reward_std": 0.012290691025555134, + "rewards//mean": 0.75933837890625, + "rewards//std": 0.029835429042577744, + "step": 2338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4678, + "grad_norm": 7.534930229187012, + "kl": 1.6253632605075836, + "learning_rate": 5.592019841532506e-07, + "loss": 0.1625, + "num_tokens": 20216830.0, + "reward": 0.73760986328125, + "reward_std": 0.004515606909990311, + "rewards//mean": 0.73760986328125, + "rewards//std": 0.034474872052669525, + "step": 2339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.468, + "grad_norm": 5.634191513061523, + "kl": 0.7963573597371578, + "learning_rate": 5.588868719333974e-07, + "loss": 0.0796, + "num_tokens": 20225406.0, + "reward": 0.76348876953125, + "reward_std": 0.0066134692169725895, + "rewards//mean": 0.76348876953125, + "rewards//std": 0.025844957679510117, + "step": 2340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4682, + "grad_norm": 3.624253273010254, + "kl": 1.9819969702512026, + "learning_rate": 5.585717359939192e-07, + "loss": 0.1982, + "num_tokens": 20234062.0, + "reward": 0.74481201171875, + "reward_std": 0.015535264275968075, + "rewards//mean": 0.74481201171875, + "rewards//std": 0.03488343954086304, + "step": 2341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4684, + "grad_norm": 5.5670318603515625, + "kl": 2.168871747329831, + "learning_rate": 5.582565764617527e-07, + "loss": 0.2169, + "num_tokens": 20242662.0, + "reward": 0.76324462890625, + "reward_std": 0.016797028481960297, + "rewards//mean": 0.76324462890625, + "rewards//std": 0.0351363830268383, + "step": 2342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4686, + "grad_norm": 5.1122846603393555, + "kl": 2.214338503777981, + "learning_rate": 5.579413934638442e-07, + "loss": 0.2214, + "num_tokens": 20251350.0, + "reward": 0.75897216796875, + "reward_std": 0.014462747611105442, + "rewards//mean": 0.75897216796875, + "rewards//std": 0.03244909271597862, + "step": 2343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4688, + "grad_norm": 11.98872184753418, + "kl": 1.3728271946310997, + "learning_rate": 5.576261871271494e-07, + "loss": 0.1373, + "num_tokens": 20259998.0, + "reward": 0.7806396484375, + "reward_std": 0.006320142187178135, + "rewards//mean": 0.7806396484375, + "rewards//std": 0.022143280133605003, + "step": 2344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.469, + "grad_norm": 4.2399067878723145, + "kl": 1.5247728023678064, + "learning_rate": 5.573109575786333e-07, + "loss": 0.1525, + "num_tokens": 20268678.0, + "reward": 0.76953125, + "reward_std": 0.006677086930721998, + "rewards//mean": 0.76953125, + "rewards//std": 0.036003757268190384, + "step": 2345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4692, + "grad_norm": 5.6891560554504395, + "kl": 1.9210570957511663, + "learning_rate": 5.569957049452702e-07, + "loss": 0.1921, + "num_tokens": 20277302.0, + "reward": 0.7373046875, + "reward_std": 0.013037643395364285, + "rewards//mean": 0.7373046875, + "rewards//std": 0.03433721512556076, + "step": 2346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4694, + "grad_norm": 7.659826278686523, + "kl": 2.3831970505416393, + "learning_rate": 5.566804293540443e-07, + "loss": 0.2383, + "num_tokens": 20286022.0, + "reward": 0.72149658203125, + "reward_std": 0.011374297551810741, + "rewards//mean": 0.72149658203125, + "rewards//std": 0.042230650782585144, + "step": 2347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4696, + "grad_norm": 6.007536888122559, + "kl": 1.6840916611254215, + "learning_rate": 5.563651309319479e-07, + "loss": 0.1684, + "num_tokens": 20294686.0, + "reward": 0.71881103515625, + "reward_std": 0.005094154272228479, + "rewards//mean": 0.71881103515625, + "rewards//std": 0.033483635634183884, + "step": 2348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4698, + "grad_norm": 3.593541145324707, + "kl": 1.4735087547451258, + "learning_rate": 5.560498098059837e-07, + "loss": 0.1474, + "num_tokens": 20303326.0, + "reward": 0.764404296875, + "reward_std": 0.01063027698546648, + "rewards//mean": 0.764404296875, + "rewards//std": 0.03827248886227608, + "step": 2349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.47, + "grad_norm": 7.469547271728516, + "kl": 1.8092053569853306, + "learning_rate": 5.557344661031627e-07, + "loss": 0.1809, + "num_tokens": 20312006.0, + "reward": 0.7696533203125, + "reward_std": 0.008232003077864647, + "rewards//mean": 0.7696533203125, + "rewards//std": 0.03077334351837635, + "step": 2350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4702, + "grad_norm": 3.0295908451080322, + "kl": 1.1302433591336012, + "learning_rate": 5.554190999505055e-07, + "loss": 0.113, + "num_tokens": 20320638.0, + "reward": 0.7838134765625, + "reward_std": 0.009080699644982815, + "rewards//mean": 0.7838134765625, + "rewards//std": 0.024282259866595268, + "step": 2351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4704, + "grad_norm": 3.3748440742492676, + "kl": 1.6492799259722233, + "learning_rate": 5.551037114750414e-07, + "loss": 0.1649, + "num_tokens": 20329318.0, + "reward": 0.7906494140625, + "reward_std": 0.01591879315674305, + "rewards//mean": 0.7906494140625, + "rewards//std": 0.029820013791322708, + "step": 2352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4706, + "grad_norm": 3.730609893798828, + "kl": 1.2904148362576962, + "learning_rate": 5.54788300803809e-07, + "loss": 0.129, + "num_tokens": 20337934.0, + "reward": 0.781494140625, + "reward_std": 0.012361861765384674, + "rewards//mean": 0.781494140625, + "rewards//std": 0.033896174281835556, + "step": 2353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4708, + "grad_norm": 16.547056198120117, + "kl": 1.7161910571157932, + "learning_rate": 5.544728680638556e-07, + "loss": 0.1716, + "num_tokens": 20346558.0, + "reward": 0.7236328125, + "reward_std": 0.011692550964653492, + "rewards//mean": 0.7236328125, + "rewards//std": 0.03303584083914757, + "step": 2354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.471, + "grad_norm": 13.908448219299316, + "kl": 2.5406471360474825, + "learning_rate": 5.541574133822373e-07, + "loss": 0.2541, + "num_tokens": 20355190.0, + "reward": 0.77227783203125, + "reward_std": 0.01633216254413128, + "rewards//mean": 0.77227783203125, + "rewards//std": 0.03421482443809509, + "step": 2355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4712, + "grad_norm": 6.073451995849609, + "kl": 1.8476961888372898, + "learning_rate": 5.538419368860195e-07, + "loss": 0.1848, + "num_tokens": 20363830.0, + "reward": 0.7569580078125, + "reward_std": 0.008523719385266304, + "rewards//mean": 0.7569580078125, + "rewards//std": 0.034291114658117294, + "step": 2356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4714, + "grad_norm": 1.997670292854309, + "kl": 0.5726965721696615, + "learning_rate": 5.535264387022759e-07, + "loss": 0.0573, + "num_tokens": 20372358.0, + "reward": 0.7528076171875, + "reward_std": 0.0043338630348443985, + "rewards//mean": 0.7528076171875, + "rewards//std": 0.0379776731133461, + "step": 2357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4716, + "grad_norm": 5.853262901306152, + "kl": 1.6502835061401129, + "learning_rate": 5.532109189580892e-07, + "loss": 0.165, + "num_tokens": 20381030.0, + "reward": 0.76513671875, + "reward_std": 0.00912112183868885, + "rewards//mean": 0.76513671875, + "rewards//std": 0.03212517872452736, + "step": 2358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4718, + "grad_norm": 2.3283820152282715, + "kl": 1.8510777559131384, + "learning_rate": 5.528953777805507e-07, + "loss": 0.1851, + "num_tokens": 20389582.0, + "reward": 0.732177734375, + "reward_std": 0.013040545396506786, + "rewards//mean": 0.732177734375, + "rewards//std": 0.038202811032533646, + "step": 2359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.472, + "grad_norm": 7.533634185791016, + "kl": 1.0923496261239052, + "learning_rate": 5.525798152967605e-07, + "loss": 0.1092, + "num_tokens": 20398174.0, + "reward": 0.76141357421875, + "reward_std": 0.010124849155545235, + "rewards//mean": 0.76141357421875, + "rewards//std": 0.02175803855061531, + "step": 2360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4722, + "grad_norm": 5.459146022796631, + "kl": 1.9826487191021442, + "learning_rate": 5.522642316338268e-07, + "loss": 0.1983, + "num_tokens": 20406910.0, + "reward": 0.77264404296875, + "reward_std": 0.009997588582336903, + "rewards//mean": 0.77264404296875, + "rewards//std": 0.03577551990747452, + "step": 2361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4724, + "grad_norm": 17.916576385498047, + "kl": 2.8776147179305553, + "learning_rate": 5.519486269188669e-07, + "loss": 0.2878, + "num_tokens": 20415590.0, + "reward": 0.73516845703125, + "reward_std": 0.011953383684158325, + "rewards//mean": 0.73516845703125, + "rewards//std": 0.03097894974052906, + "step": 2362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4726, + "grad_norm": 7.320766448974609, + "kl": 1.163089882582426, + "learning_rate": 5.516330012790062e-07, + "loss": 0.1163, + "num_tokens": 20424278.0, + "reward": 0.75311279296875, + "reward_std": 0.0022918626200407743, + "rewards//mean": 0.75311279296875, + "rewards//std": 0.028731774538755417, + "step": 2363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4728, + "grad_norm": 3.730656385421753, + "kl": 1.4389889277517796, + "learning_rate": 5.513173548413789e-07, + "loss": 0.1439, + "num_tokens": 20432950.0, + "reward": 0.769775390625, + "reward_std": 0.006673668976873159, + "rewards//mean": 0.769775390625, + "rewards//std": 0.030168499797582626, + "step": 2364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.473, + "grad_norm": 3.7948830127716064, + "kl": 1.1013858653604984, + "learning_rate": 5.51001687733127e-07, + "loss": 0.1101, + "num_tokens": 20441566.0, + "reward": 0.76507568359375, + "reward_std": 0.008471095934510231, + "rewards//mean": 0.76507568359375, + "rewards//std": 0.02918913960456848, + "step": 2365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4732, + "grad_norm": 1.7461118698120117, + "kl": 0.8687972743064165, + "learning_rate": 5.506860000814017e-07, + "loss": 0.0869, + "num_tokens": 20450190.0, + "reward": 0.75543212890625, + "reward_std": 0.006178555078804493, + "rewards//mean": 0.75543212890625, + "rewards//std": 0.023121394217014313, + "step": 2366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4734, + "grad_norm": 5.307165145874023, + "kl": 2.168257789686322, + "learning_rate": 5.503702920133614e-07, + "loss": 0.2168, + "num_tokens": 20458830.0, + "reward": 0.7569580078125, + "reward_std": 0.016736673191189766, + "rewards//mean": 0.7569580078125, + "rewards//std": 0.03543737530708313, + "step": 2367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4736, + "grad_norm": 2.408681869506836, + "kl": 0.859221825376153, + "learning_rate": 5.500545636561736e-07, + "loss": 0.0859, + "num_tokens": 20467398.0, + "reward": 0.7349853515625, + "reward_std": 0.005485980771481991, + "rewards//mean": 0.7349853515625, + "rewards//std": 0.028316037729382515, + "step": 2368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4738, + "grad_norm": 2.8751676082611084, + "kl": 0.7352927830070257, + "learning_rate": 5.497388151370135e-07, + "loss": 0.0735, + "num_tokens": 20475902.0, + "reward": 0.73931884765625, + "reward_std": 0.008368296548724174, + "rewards//mean": 0.73931884765625, + "rewards//std": 0.028580166399478912, + "step": 2369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.474, + "grad_norm": 5.824247360229492, + "kl": 1.7643723916262388, + "learning_rate": 5.494230465830647e-07, + "loss": 0.1764, + "num_tokens": 20484750.0, + "reward": 0.7744140625, + "reward_std": 0.010680560022592545, + "rewards//mean": 0.7744140625, + "rewards//std": 0.04264908656477928, + "step": 2370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4742, + "grad_norm": 3.222755193710327, + "kl": 1.3829617220908403, + "learning_rate": 5.491072581215186e-07, + "loss": 0.1383, + "num_tokens": 20493390.0, + "reward": 0.7486572265625, + "reward_std": 0.012149108573794365, + "rewards//mean": 0.7486572265625, + "rewards//std": 0.03266141563653946, + "step": 2371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4744, + "grad_norm": 3.4554693698883057, + "kl": 1.597306763753295, + "learning_rate": 5.487914498795747e-07, + "loss": 0.1597, + "num_tokens": 20501966.0, + "reward": 0.71661376953125, + "reward_std": 0.01048105675727129, + "rewards//mean": 0.71661376953125, + "rewards//std": 0.030520137399435043, + "step": 2372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4746, + "grad_norm": 4.405734062194824, + "kl": 0.8742990344762802, + "learning_rate": 5.484756219844407e-07, + "loss": 0.0874, + "num_tokens": 20510662.0, + "reward": 0.76708984375, + "reward_std": 0.006215992383658886, + "rewards//mean": 0.76708984375, + "rewards//std": 0.018823999911546707, + "step": 2373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4748, + "grad_norm": 6.031327247619629, + "kl": 1.6523740869015455, + "learning_rate": 5.48159774563332e-07, + "loss": 0.1652, + "num_tokens": 20519302.0, + "reward": 0.76385498046875, + "reward_std": 0.012404412031173706, + "rewards//mean": 0.76385498046875, + "rewards//std": 0.036411333829164505, + "step": 2374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.475, + "grad_norm": 3.2618987560272217, + "kl": 2.257495630532503, + "learning_rate": 5.478439077434717e-07, + "loss": 0.2257, + "num_tokens": 20527854.0, + "reward": 0.74737548828125, + "reward_std": 0.02104460448026657, + "rewards//mean": 0.74737548828125, + "rewards//std": 0.03624549135565758, + "step": 2375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4752, + "grad_norm": 2.385925531387329, + "kl": 1.414399253204465, + "learning_rate": 5.475280216520912e-07, + "loss": 0.1414, + "num_tokens": 20536462.0, + "reward": 0.75140380859375, + "reward_std": 0.009587482549250126, + "rewards//mean": 0.75140380859375, + "rewards//std": 0.03214678540825844, + "step": 2376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4754, + "grad_norm": 1.7421376705169678, + "kl": 1.2399223744869232, + "learning_rate": 5.472121164164295e-07, + "loss": 0.124, + "num_tokens": 20545118.0, + "reward": 0.74566650390625, + "reward_std": 0.005600661039352417, + "rewards//mean": 0.74566650390625, + "rewards//std": 0.03243742883205414, + "step": 2377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4756, + "grad_norm": 2.338890552520752, + "kl": 1.129277614876628, + "learning_rate": 5.468961921637326e-07, + "loss": 0.1129, + "num_tokens": 20553774.0, + "reward": 0.749755859375, + "reward_std": 0.008636845275759697, + "rewards//mean": 0.749755859375, + "rewards//std": 0.03151145577430725, + "step": 2378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4758, + "grad_norm": 6.848686218261719, + "kl": 0.9416264984756708, + "learning_rate": 5.465802490212554e-07, + "loss": 0.0942, + "num_tokens": 20562390.0, + "reward": 0.765869140625, + "reward_std": 0.005401856731623411, + "rewards//mean": 0.765869140625, + "rewards//std": 0.025722326710820198, + "step": 2379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.476, + "grad_norm": 2.9308903217315674, + "kl": 1.2491926103830338, + "learning_rate": 5.462642871162592e-07, + "loss": 0.1249, + "num_tokens": 20570982.0, + "reward": 0.75927734375, + "reward_std": 0.00913756899535656, + "rewards//mean": 0.75927734375, + "rewards//std": 0.031408485025167465, + "step": 2380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4762, + "grad_norm": 2.080799102783203, + "kl": 1.2052655890583992, + "learning_rate": 5.459483065760138e-07, + "loss": 0.1205, + "num_tokens": 20579638.0, + "reward": 0.79754638671875, + "reward_std": 0.005787097383290529, + "rewards//mean": 0.79754638671875, + "rewards//std": 0.023974450305104256, + "step": 2381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4764, + "grad_norm": 1.8432962894439697, + "kl": 0.8602310474961996, + "learning_rate": 5.456323075277959e-07, + "loss": 0.086, + "num_tokens": 20588214.0, + "reward": 0.78472900390625, + "reward_std": 0.005707837641239166, + "rewards//mean": 0.78472900390625, + "rewards//std": 0.028349362313747406, + "step": 2382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4766, + "grad_norm": 3.033191680908203, + "kl": 1.041666615754366, + "learning_rate": 5.453162900988901e-07, + "loss": 0.1042, + "num_tokens": 20596910.0, + "reward": 0.7398681640625, + "reward_std": 0.004075206816196442, + "rewards//mean": 0.7398681640625, + "rewards//std": 0.02657306380569935, + "step": 2383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4768, + "grad_norm": 3.115701913833618, + "kl": 1.5501028411090374, + "learning_rate": 5.45000254416588e-07, + "loss": 0.155, + "num_tokens": 20605598.0, + "reward": 0.7596435546875, + "reward_std": 0.007988456636667252, + "rewards//mean": 0.7596435546875, + "rewards//std": 0.024665741249918938, + "step": 2384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.477, + "grad_norm": 4.01870059967041, + "kl": 1.2279795445501804, + "learning_rate": 5.446842006081888e-07, + "loss": 0.1228, + "num_tokens": 20614302.0, + "reward": 0.7421875, + "reward_std": 0.009591257199645042, + "rewards//mean": 0.7421875, + "rewards//std": 0.031218983232975006, + "step": 2385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4772, + "grad_norm": 2.4988017082214355, + "kl": 1.0164373964071274, + "learning_rate": 5.443681288009991e-07, + "loss": 0.1016, + "num_tokens": 20622870.0, + "reward": 0.75128173828125, + "reward_std": 0.006798489019274712, + "rewards//mean": 0.75128173828125, + "rewards//std": 0.027233289554715157, + "step": 2386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4774, + "grad_norm": 4.255586624145508, + "kl": 1.8953668642789125, + "learning_rate": 5.440520391223322e-07, + "loss": 0.1895, + "num_tokens": 20631550.0, + "reward": 0.74102783203125, + "reward_std": 0.012871386483311653, + "rewards//mean": 0.74102783203125, + "rewards//std": 0.04220196232199669, + "step": 2387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4776, + "grad_norm": 3.2451162338256836, + "kl": 1.3147853147238493, + "learning_rate": 5.437359316995093e-07, + "loss": 0.1315, + "num_tokens": 20640134.0, + "reward": 0.78338623046875, + "reward_std": 0.011636747978627682, + "rewards//mean": 0.78338623046875, + "rewards//std": 0.0326889231801033, + "step": 2388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4778, + "grad_norm": 3.346529960632324, + "kl": 1.4629386942833662, + "learning_rate": 5.434198066598584e-07, + "loss": 0.1463, + "num_tokens": 20648814.0, + "reward": 0.7381591796875, + "reward_std": 0.010226997546851635, + "rewards//mean": 0.7381591796875, + "rewards//std": 0.03488284349441528, + "step": 2389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.478, + "grad_norm": 3.151991367340088, + "kl": 0.9680822752416134, + "learning_rate": 5.431036641307145e-07, + "loss": 0.0968, + "num_tokens": 20657366.0, + "reward": 0.76751708984375, + "reward_std": 0.010263264179229736, + "rewards//mean": 0.76751708984375, + "rewards//std": 0.03338584303855896, + "step": 2390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4782, + "grad_norm": 5.113560676574707, + "kl": 1.6523213759064674, + "learning_rate": 5.427875042394199e-07, + "loss": 0.1652, + "num_tokens": 20666054.0, + "reward": 0.7545166015625, + "reward_std": 0.010957421734929085, + "rewards//mean": 0.7545166015625, + "rewards//std": 0.04223374277353287, + "step": 2391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4784, + "grad_norm": 1.4752440452575684, + "kl": 0.9242366477847099, + "learning_rate": 5.424713271133236e-07, + "loss": 0.0924, + "num_tokens": 20674678.0, + "reward": 0.76971435546875, + "reward_std": 0.010118639096617699, + "rewards//mean": 0.76971435546875, + "rewards//std": 0.029168905690312386, + "step": 2392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4786, + "grad_norm": 4.6253814697265625, + "kl": 1.3969186749309301, + "learning_rate": 5.421551328797819e-07, + "loss": 0.1397, + "num_tokens": 20683310.0, + "reward": 0.73516845703125, + "reward_std": 0.006573447026312351, + "rewards//mean": 0.73516845703125, + "rewards//std": 0.03290493041276932, + "step": 2393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4788, + "grad_norm": 6.035991191864014, + "kl": 1.8346667736768723, + "learning_rate": 5.418389216661578e-07, + "loss": 0.1835, + "num_tokens": 20692062.0, + "reward": 0.7674560546875, + "reward_std": 0.01070532388985157, + "rewards//mean": 0.7674560546875, + "rewards//std": 0.03357376530766487, + "step": 2394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.479, + "grad_norm": 3.1363930702209473, + "kl": 0.8946264553815126, + "learning_rate": 5.41522693599821e-07, + "loss": 0.0895, + "num_tokens": 20700630.0, + "reward": 0.73858642578125, + "reward_std": 0.006060024257749319, + "rewards//mean": 0.73858642578125, + "rewards//std": 0.028690651059150696, + "step": 2395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4792, + "grad_norm": 5.421539783477783, + "kl": 1.6835479103028774, + "learning_rate": 5.412064488081481e-07, + "loss": 0.1684, + "num_tokens": 20709326.0, + "reward": 0.7412109375, + "reward_std": 0.01485502254217863, + "rewards//mean": 0.7412109375, + "rewards//std": 0.043990932404994965, + "step": 2396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4794, + "grad_norm": 3.760629892349243, + "kl": 0.90433101169765, + "learning_rate": 5.408901874185225e-07, + "loss": 0.0904, + "num_tokens": 20717990.0, + "reward": 0.7789306640625, + "reward_std": 0.009347447194159031, + "rewards//mean": 0.7789306640625, + "rewards//std": 0.023955894634127617, + "step": 2397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4796, + "grad_norm": 3.915766954421997, + "kl": 1.259184930473566, + "learning_rate": 5.405739095583344e-07, + "loss": 0.1259, + "num_tokens": 20726470.0, + "reward": 0.73699951171875, + "reward_std": 0.0057004536502063274, + "rewards//mean": 0.73699951171875, + "rewards//std": 0.031879622489213943, + "step": 2398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4798, + "grad_norm": 3.1569457054138184, + "kl": 0.7055944092571735, + "learning_rate": 5.402576153549804e-07, + "loss": 0.0706, + "num_tokens": 20735102.0, + "reward": 0.7777099609375, + "reward_std": 0.004905324894934893, + "rewards//mean": 0.7777099609375, + "rewards//std": 0.02233387529850006, + "step": 2399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.48, + "grad_norm": 3.7481210231781006, + "kl": 0.8484075181186199, + "learning_rate": 5.399413049358637e-07, + "loss": 0.0848, + "num_tokens": 20743766.0, + "reward": 0.739013671875, + "reward_std": 0.007856871001422405, + "rewards//mean": 0.739013671875, + "rewards//std": 0.03247293457388878, + "step": 2400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4802, + "grad_norm": 0.8663788437843323, + "kl": 1.1256586238741875, + "learning_rate": 5.396249784283942e-07, + "loss": 0.1126, + "num_tokens": 20752446.0, + "reward": 0.7745361328125, + "reward_std": 0.006847010459750891, + "rewards//mean": 0.7745361328125, + "rewards//std": 0.02537720836699009, + "step": 2401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4804, + "grad_norm": 3.7225022315979004, + "kl": 1.1924304328858852, + "learning_rate": 5.393086359599881e-07, + "loss": 0.1192, + "num_tokens": 20761030.0, + "reward": 0.7216796875, + "reward_std": 0.005747835151851177, + "rewards//mean": 0.7216796875, + "rewards//std": 0.04344244301319122, + "step": 2402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4806, + "grad_norm": 5.366184711456299, + "kl": 1.1424797587096691, + "learning_rate": 5.389922776580681e-07, + "loss": 0.1142, + "num_tokens": 20769654.0, + "reward": 0.74951171875, + "reward_std": 0.008655513636767864, + "rewards//mean": 0.74951171875, + "rewards//std": 0.027878835797309875, + "step": 2403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4808, + "grad_norm": 4.726572036743164, + "kl": 1.9362168963998556, + "learning_rate": 5.386759036500634e-07, + "loss": 0.1936, + "num_tokens": 20778302.0, + "reward": 0.7403564453125, + "reward_std": 0.01720508560538292, + "rewards//mean": 0.7403564453125, + "rewards//std": 0.040974825620651245, + "step": 2404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.481, + "grad_norm": 3.2456603050231934, + "kl": 1.4790656119585037, + "learning_rate": 5.383595140634093e-07, + "loss": 0.1479, + "num_tokens": 20786974.0, + "reward": 0.74285888671875, + "reward_std": 0.010712843388319016, + "rewards//mean": 0.74285888671875, + "rewards//std": 0.029261136427521706, + "step": 2405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4812, + "grad_norm": 1.3181790113449097, + "kl": 1.2699118461459875, + "learning_rate": 5.380431090255475e-07, + "loss": 0.127, + "num_tokens": 20795574.0, + "reward": 0.75799560546875, + "reward_std": 0.0057169292122125626, + "rewards//mean": 0.75799560546875, + "rewards//std": 0.02426440827548504, + "step": 2406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4814, + "grad_norm": 0.7074439525604248, + "kl": 0.8004306070506573, + "learning_rate": 5.377266886639259e-07, + "loss": 0.08, + "num_tokens": 20804230.0, + "reward": 0.786376953125, + "reward_std": 0.002361732069402933, + "rewards//mean": 0.786376953125, + "rewards//std": 0.02509317174553871, + "step": 2407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4816, + "grad_norm": 3.124009847640991, + "kl": 1.0801638569682837, + "learning_rate": 5.374102531059987e-07, + "loss": 0.108, + "num_tokens": 20812790.0, + "reward": 0.77008056640625, + "reward_std": 0.004977153614163399, + "rewards//mean": 0.77008056640625, + "rewards//std": 0.01872553490102291, + "step": 2408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4818, + "grad_norm": 4.23710823059082, + "kl": 0.9954422302544117, + "learning_rate": 5.370938024792261e-07, + "loss": 0.0995, + "num_tokens": 20821414.0, + "reward": 0.7711181640625, + "reward_std": 0.01230654213577509, + "rewards//mean": 0.7711181640625, + "rewards//std": 0.021773774176836014, + "step": 2409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.482, + "grad_norm": 5.235261917114258, + "kl": 1.7377068027853966, + "learning_rate": 5.367773369110741e-07, + "loss": 0.1738, + "num_tokens": 20830046.0, + "reward": 0.75701904296875, + "reward_std": 0.008126323111355305, + "rewards//mean": 0.75701904296875, + "rewards//std": 0.028528742492198944, + "step": 2410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4822, + "grad_norm": 2.322641134262085, + "kl": 1.1099243760108948, + "learning_rate": 5.364608565290154e-07, + "loss": 0.111, + "num_tokens": 20838622.0, + "reward": 0.74981689453125, + "reward_std": 0.009127575904130936, + "rewards//mean": 0.74981689453125, + "rewards//std": 0.02691458910703659, + "step": 2411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4824, + "grad_norm": 5.99852180480957, + "kl": 1.0340879615396261, + "learning_rate": 5.361443614605278e-07, + "loss": 0.1034, + "num_tokens": 20847286.0, + "reward": 0.741455078125, + "reward_std": 0.005980014801025391, + "rewards//mean": 0.741455078125, + "rewards//std": 0.02082025073468685, + "step": 2412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4826, + "grad_norm": 6.124125003814697, + "kl": 1.9533657357096672, + "learning_rate": 5.358278518330959e-07, + "loss": 0.1953, + "num_tokens": 20856046.0, + "reward": 0.76837158203125, + "reward_std": 0.01573370024561882, + "rewards//mean": 0.76837158203125, + "rewards//std": 0.03423605486750603, + "step": 2413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4828, + "grad_norm": 5.601797580718994, + "kl": 0.7532588988542557, + "learning_rate": 5.355113277742095e-07, + "loss": 0.0753, + "num_tokens": 20864718.0, + "reward": 0.7572021484375, + "reward_std": 0.005440001841634512, + "rewards//mean": 0.7572021484375, + "rewards//std": 0.02801506221294403, + "step": 2414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.483, + "grad_norm": 1.8941010236740112, + "kl": 1.2369064036756754, + "learning_rate": 5.351947894113645e-07, + "loss": 0.1237, + "num_tokens": 20873358.0, + "reward": 0.75384521484375, + "reward_std": 0.005494498647749424, + "rewards//mean": 0.75384521484375, + "rewards//std": 0.027809178456664085, + "step": 2415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4832, + "grad_norm": 7.199653148651123, + "kl": 2.1940044686198235, + "learning_rate": 5.348782368720625e-07, + "loss": 0.2194, + "num_tokens": 20882038.0, + "reward": 0.73101806640625, + "reward_std": 0.01259165070950985, + "rewards//mean": 0.73101806640625, + "rewards//std": 0.025457236915826797, + "step": 2416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4834, + "grad_norm": 6.638148784637451, + "kl": 1.1568494327366352, + "learning_rate": 5.34561670283811e-07, + "loss": 0.1157, + "num_tokens": 20890646.0, + "reward": 0.7230224609375, + "reward_std": 0.008421721868216991, + "rewards//mean": 0.7230224609375, + "rewards//std": 0.033701568841934204, + "step": 2417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4836, + "grad_norm": 0.5360536575317383, + "kl": 0.4617897365242243, + "learning_rate": 5.342450897741228e-07, + "loss": 0.0462, + "num_tokens": 20899342.0, + "reward": 0.76800537109375, + "reward_std": 0.0023699593730270863, + "rewards//mean": 0.76800537109375, + "rewards//std": 0.018527235835790634, + "step": 2418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4838, + "grad_norm": 2.7877962589263916, + "kl": 1.2675588093698025, + "learning_rate": 5.339284954705165e-07, + "loss": 0.1268, + "num_tokens": 20907950.0, + "reward": 0.7613525390625, + "reward_std": 0.009189009666442871, + "rewards//mean": 0.7613525390625, + "rewards//std": 0.023548007011413574, + "step": 2419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.484, + "grad_norm": 2.219937562942505, + "kl": 1.302821209654212, + "learning_rate": 5.336118875005164e-07, + "loss": 0.1303, + "num_tokens": 20916646.0, + "reward": 0.76959228515625, + "reward_std": 0.01067274808883667, + "rewards//mean": 0.76959228515625, + "rewards//std": 0.03654785454273224, + "step": 2420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4842, + "grad_norm": 3.487543821334839, + "kl": 1.1214262414723635, + "learning_rate": 5.33295265991652e-07, + "loss": 0.1121, + "num_tokens": 20925278.0, + "reward": 0.78662109375, + "reward_std": 0.008769930340349674, + "rewards//mean": 0.78662109375, + "rewards//std": 0.027809247374534607, + "step": 2421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4844, + "grad_norm": 2.463282823562622, + "kl": 1.2247951347380877, + "learning_rate": 5.329786310714582e-07, + "loss": 0.1225, + "num_tokens": 20933974.0, + "reward": 0.7578125, + "reward_std": 0.008317554369568825, + "rewards//mean": 0.7578125, + "rewards//std": 0.03292568400502205, + "step": 2422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4846, + "grad_norm": 6.546343803405762, + "kl": 1.7280665580183268, + "learning_rate": 5.326619828674761e-07, + "loss": 0.1728, + "num_tokens": 20942590.0, + "reward": 0.7662353515625, + "reward_std": 0.009146787226200104, + "rewards//mean": 0.7662353515625, + "rewards//std": 0.029554864391684532, + "step": 2423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4848, + "grad_norm": 2.5868356227874756, + "kl": 1.1923370379954576, + "learning_rate": 5.323453215072509e-07, + "loss": 0.1192, + "num_tokens": 20951278.0, + "reward": 0.736083984375, + "reward_std": 0.00791519321501255, + "rewards//mean": 0.736083984375, + "rewards//std": 0.029305141419172287, + "step": 2424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.485, + "grad_norm": 3.408597707748413, + "kl": 1.166304750367999, + "learning_rate": 5.320286471183343e-07, + "loss": 0.1166, + "num_tokens": 20959934.0, + "reward": 0.76416015625, + "reward_std": 0.00750743318349123, + "rewards//mean": 0.76416015625, + "rewards//std": 0.0238117054104805, + "step": 2425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4852, + "grad_norm": 1.0867911577224731, + "kl": 1.1646274402737617, + "learning_rate": 5.317119598282822e-07, + "loss": 0.1165, + "num_tokens": 20968454.0, + "reward": 0.7314453125, + "reward_std": 0.008880149573087692, + "rewards//mean": 0.7314453125, + "rewards//std": 0.03119570016860962, + "step": 2426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4854, + "grad_norm": 5.935849666595459, + "kl": 0.8935622684657574, + "learning_rate": 5.313952597646567e-07, + "loss": 0.0894, + "num_tokens": 20977110.0, + "reward": 0.7574462890625, + "reward_std": 0.008292323909699917, + "rewards//mean": 0.7574462890625, + "rewards//std": 0.02793280966579914, + "step": 2427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4856, + "grad_norm": 1.2621906995773315, + "kl": 1.2011620700359344, + "learning_rate": 5.310785470550242e-07, + "loss": 0.1201, + "num_tokens": 20985726.0, + "reward": 0.778076171875, + "reward_std": 0.008557998575270176, + "rewards//mean": 0.778076171875, + "rewards//std": 0.027426665648818016, + "step": 2428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4858, + "grad_norm": 7.533359050750732, + "kl": 1.9700902421027422, + "learning_rate": 5.307618218269568e-07, + "loss": 0.197, + "num_tokens": 20994430.0, + "reward": 0.76513671875, + "reward_std": 0.013977913185954094, + "rewards//mean": 0.76513671875, + "rewards//std": 0.03700234740972519, + "step": 2429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.486, + "grad_norm": 10.27749252319336, + "kl": 2.4508003666996956, + "learning_rate": 5.304450842080312e-07, + "loss": 0.2451, + "num_tokens": 21003126.0, + "reward": 0.74603271484375, + "reward_std": 0.013629911467432976, + "rewards//mean": 0.74603271484375, + "rewards//std": 0.023297492414712906, + "step": 2430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4862, + "grad_norm": 3.78019380569458, + "kl": 1.7588821966201067, + "learning_rate": 5.301283343258292e-07, + "loss": 0.1759, + "num_tokens": 21011710.0, + "reward": 0.73291015625, + "reward_std": 0.012177502736449242, + "rewards//mean": 0.73291015625, + "rewards//std": 0.026534585282206535, + "step": 2431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4864, + "grad_norm": 24.54292106628418, + "kl": 0.7830894161015749, + "learning_rate": 5.298115723079379e-07, + "loss": 0.0783, + "num_tokens": 21020398.0, + "reward": 0.7369384765625, + "reward_std": 0.010155413299798965, + "rewards//mean": 0.7369384765625, + "rewards//std": 0.023563429713249207, + "step": 2432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4866, + "grad_norm": 3.6233930587768555, + "kl": 1.2621662579476833, + "learning_rate": 5.294947982819487e-07, + "loss": 0.1262, + "num_tokens": 21029070.0, + "reward": 0.74822998046875, + "reward_std": 0.008707704953849316, + "rewards//mean": 0.74822998046875, + "rewards//std": 0.03012073040008545, + "step": 2433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4868, + "grad_norm": 5.497499465942383, + "kl": 1.2038103509694338, + "learning_rate": 5.291780123754585e-07, + "loss": 0.1204, + "num_tokens": 21037654.0, + "reward": 0.775146484375, + "reward_std": 0.009904496371746063, + "rewards//mean": 0.775146484375, + "rewards//std": 0.030780475586652756, + "step": 2434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.487, + "grad_norm": 24.257749557495117, + "kl": 1.6646695658564568, + "learning_rate": 5.28861214716068e-07, + "loss": 0.1665, + "num_tokens": 21046414.0, + "reward": 0.7393798828125, + "reward_std": 0.0069559672847390175, + "rewards//mean": 0.7393798828125, + "rewards//std": 0.029695892706513405, + "step": 2435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4872, + "grad_norm": 6.960176467895508, + "kl": 1.5309266168624163, + "learning_rate": 5.28544405431384e-07, + "loss": 0.1531, + "num_tokens": 21054990.0, + "reward": 0.75299072265625, + "reward_std": 0.010916500352323055, + "rewards//mean": 0.75299072265625, + "rewards//std": 0.023258473724126816, + "step": 2436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4874, + "grad_norm": 20.682844161987305, + "kl": 1.7779922243207693, + "learning_rate": 5.282275846490169e-07, + "loss": 0.1778, + "num_tokens": 21063614.0, + "reward": 0.73382568359375, + "reward_std": 0.011740854009985924, + "rewards//mean": 0.73382568359375, + "rewards//std": 0.03233833983540535, + "step": 2437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4876, + "grad_norm": 6.750876426696777, + "kl": 1.1652787048369646, + "learning_rate": 5.27910752496582e-07, + "loss": 0.1165, + "num_tokens": 21072286.0, + "reward": 0.77471923828125, + "reward_std": 0.012273425236344337, + "rewards//mean": 0.77471923828125, + "rewards//std": 0.03743545338511467, + "step": 2438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4878, + "grad_norm": 9.106575012207031, + "kl": 1.8281249310821295, + "learning_rate": 5.275939091016992e-07, + "loss": 0.1828, + "num_tokens": 21080958.0, + "reward": 0.75732421875, + "reward_std": 0.012984936125576496, + "rewards//mean": 0.75732421875, + "rewards//std": 0.028395313769578934, + "step": 2439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.488, + "grad_norm": 29.97038459777832, + "kl": 1.6434762328863144, + "learning_rate": 5.272770545919933e-07, + "loss": 0.1643, + "num_tokens": 21089646.0, + "reward": 0.75274658203125, + "reward_std": 0.00879595521837473, + "rewards//mean": 0.75274658203125, + "rewards//std": 0.030888909474015236, + "step": 2440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4882, + "grad_norm": 0.9008217453956604, + "kl": 0.6485368497669697, + "learning_rate": 5.26960189095093e-07, + "loss": 0.0649, + "num_tokens": 21098302.0, + "reward": 0.74127197265625, + "reward_std": 0.001820734003558755, + "rewards//mean": 0.74127197265625, + "rewards//std": 0.027034122496843338, + "step": 2441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4884, + "grad_norm": 4.168609142303467, + "kl": 1.6487079933285713, + "learning_rate": 5.266433127386318e-07, + "loss": 0.1649, + "num_tokens": 21106926.0, + "reward": 0.74957275390625, + "reward_std": 0.007807353977113962, + "rewards//mean": 0.74957275390625, + "rewards//std": 0.026467109099030495, + "step": 2442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4886, + "grad_norm": 2.889329671859741, + "kl": 0.9238882008939981, + "learning_rate": 5.263264256502474e-07, + "loss": 0.0924, + "num_tokens": 21115654.0, + "reward": 0.75726318359375, + "reward_std": 0.008642888627946377, + "rewards//mean": 0.75726318359375, + "rewards//std": 0.030866848304867744, + "step": 2443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4888, + "grad_norm": 12.018540382385254, + "kl": 1.3975722044706345, + "learning_rate": 5.260095279575818e-07, + "loss": 0.1398, + "num_tokens": 21124230.0, + "reward": 0.7637939453125, + "reward_std": 0.00931993406265974, + "rewards//mean": 0.7637939453125, + "rewards//std": 0.03776983544230461, + "step": 2444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.489, + "grad_norm": 9.510030746459961, + "kl": 1.4919980596750975, + "learning_rate": 5.256926197882815e-07, + "loss": 0.1492, + "num_tokens": 21132830.0, + "reward": 0.76055908203125, + "reward_std": 0.007172760087996721, + "rewards//mean": 0.76055908203125, + "rewards//std": 0.027758508920669556, + "step": 2445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4892, + "grad_norm": 4.738150119781494, + "kl": 0.9700312856584787, + "learning_rate": 5.253757012699971e-07, + "loss": 0.097, + "num_tokens": 21141430.0, + "reward": 0.75726318359375, + "reward_std": 0.006562160328030586, + "rewards//mean": 0.75726318359375, + "rewards//std": 0.025476258248090744, + "step": 2446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4894, + "grad_norm": 2.2031121253967285, + "kl": 1.1507928166538477, + "learning_rate": 5.250587725303831e-07, + "loss": 0.1151, + "num_tokens": 21150102.0, + "reward": 0.75982666015625, + "reward_std": 0.005948235746473074, + "rewards//mean": 0.75982666015625, + "rewards//std": 0.026106618344783783, + "step": 2447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4896, + "grad_norm": 7.247396945953369, + "kl": 1.8477562014013529, + "learning_rate": 5.247418336970987e-07, + "loss": 0.1848, + "num_tokens": 21158758.0, + "reward": 0.75238037109375, + "reward_std": 0.010368994437158108, + "rewards//mean": 0.75238037109375, + "rewards//std": 0.03213359788060188, + "step": 2448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4898, + "grad_norm": 7.373865127563477, + "kl": 2.3209238946437836, + "learning_rate": 5.244248848978067e-07, + "loss": 0.2321, + "num_tokens": 21167366.0, + "reward": 0.76763916015625, + "reward_std": 0.013626541942358017, + "rewards//mean": 0.76763916015625, + "rewards//std": 0.032000478357076645, + "step": 2449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.49, + "grad_norm": 3.2763195037841797, + "kl": 1.371602788567543, + "learning_rate": 5.241079262601737e-07, + "loss": 0.1372, + "num_tokens": 21176062.0, + "reward": 0.7171630859375, + "reward_std": 0.005923929624259472, + "rewards//mean": 0.7171630859375, + "rewards//std": 0.025814849883317947, + "step": 2450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4902, + "grad_norm": 5.07871675491333, + "kl": 1.4598173443228006, + "learning_rate": 5.237909579118712e-07, + "loss": 0.146, + "num_tokens": 21184782.0, + "reward": 0.73199462890625, + "reward_std": 0.00647917203605175, + "rewards//mean": 0.73199462890625, + "rewards//std": 0.04035928472876549, + "step": 2451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4904, + "grad_norm": 5.12959098815918, + "kl": 1.8808238953351974, + "learning_rate": 5.234739799805734e-07, + "loss": 0.1881, + "num_tokens": 21193414.0, + "reward": 0.7183837890625, + "reward_std": 0.01224609836935997, + "rewards//mean": 0.7183837890625, + "rewards//std": 0.03791544586420059, + "step": 2452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4906, + "grad_norm": 2.4597301483154297, + "kl": 1.02066720277071, + "learning_rate": 5.231569925939595e-07, + "loss": 0.1021, + "num_tokens": 21202062.0, + "reward": 0.74755859375, + "reward_std": 0.0064194342121481895, + "rewards//mean": 0.74755859375, + "rewards//std": 0.014440802857279778, + "step": 2453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4908, + "grad_norm": 7.721793174743652, + "kl": 2.9432003386318684, + "learning_rate": 5.228399958797116e-07, + "loss": 0.2943, + "num_tokens": 21210678.0, + "reward": 0.7476806640625, + "reward_std": 0.015330106019973755, + "rewards//mean": 0.7476806640625, + "rewards//std": 0.04340182989835739, + "step": 2454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.491, + "grad_norm": 11.461202621459961, + "kl": 1.874703649431467, + "learning_rate": 5.225229899655163e-07, + "loss": 0.1875, + "num_tokens": 21219254.0, + "reward": 0.726806640625, + "reward_std": 0.007431745529174805, + "rewards//mean": 0.726806640625, + "rewards//std": 0.03505535051226616, + "step": 2455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4912, + "grad_norm": 12.311986923217773, + "kl": 1.7263416480273008, + "learning_rate": 5.222059749790631e-07, + "loss": 0.1726, + "num_tokens": 21227878.0, + "reward": 0.7332763671875, + "reward_std": 0.008239876478910446, + "rewards//mean": 0.7332763671875, + "rewards//std": 0.032327864319086075, + "step": 2456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4914, + "grad_norm": 1.9700359106063843, + "kl": 1.3818424884229898, + "learning_rate": 5.21888951048046e-07, + "loss": 0.1382, + "num_tokens": 21236550.0, + "reward": 0.76513671875, + "reward_std": 0.006688281893730164, + "rewards//mean": 0.76513671875, + "rewards//std": 0.026807021349668503, + "step": 2457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4916, + "grad_norm": 4.7922492027282715, + "kl": 2.221864778548479, + "learning_rate": 5.215719183001619e-07, + "loss": 0.2222, + "num_tokens": 21245302.0, + "reward": 0.7786865234375, + "reward_std": 0.017036166042089462, + "rewards//mean": 0.7786865234375, + "rewards//std": 0.03309604153037071, + "step": 2458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4918, + "grad_norm": 2.8701908588409424, + "kl": 1.5886991918087006, + "learning_rate": 5.212548768631117e-07, + "loss": 0.1589, + "num_tokens": 21253918.0, + "reward": 0.73565673828125, + "reward_std": 0.008597031235694885, + "rewards//mean": 0.73565673828125, + "rewards//std": 0.031489770859479904, + "step": 2459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.492, + "grad_norm": 2.0412251949310303, + "kl": 1.4479617550969124, + "learning_rate": 5.209378268645997e-07, + "loss": 0.1448, + "num_tokens": 21262502.0, + "reward": 0.7530517578125, + "reward_std": 0.011180337518453598, + "rewards//mean": 0.7530517578125, + "rewards//std": 0.03748339042067528, + "step": 2460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4922, + "grad_norm": 6.212287902832031, + "kl": 2.1694286093115807, + "learning_rate": 5.206207684323335e-07, + "loss": 0.2169, + "num_tokens": 21271166.0, + "reward": 0.7666015625, + "reward_std": 0.016059689223766327, + "rewards//mean": 0.7666015625, + "rewards//std": 0.03739625960588455, + "step": 2461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4924, + "grad_norm": 2.5854170322418213, + "kl": 1.7593621388077736, + "learning_rate": 5.203037016940245e-07, + "loss": 0.1759, + "num_tokens": 21279854.0, + "reward": 0.75732421875, + "reward_std": 0.014403636567294598, + "rewards//mean": 0.75732421875, + "rewards//std": 0.039244987070560455, + "step": 2462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4926, + "grad_norm": 3.562713861465454, + "kl": 1.3623085115104914, + "learning_rate": 5.199866267773867e-07, + "loss": 0.1362, + "num_tokens": 21288438.0, + "reward": 0.75469970703125, + "reward_std": 0.006049739196896553, + "rewards//mean": 0.75469970703125, + "rewards//std": 0.028841154649853706, + "step": 2463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4928, + "grad_norm": 1.3855479955673218, + "kl": 0.9640926569700241, + "learning_rate": 5.196695438101379e-07, + "loss": 0.0964, + "num_tokens": 21297054.0, + "reward": 0.76922607421875, + "reward_std": 0.005334881134331226, + "rewards//mean": 0.76922607421875, + "rewards//std": 0.02744811400771141, + "step": 2464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.493, + "grad_norm": 2.859048366546631, + "kl": 0.8970019854605198, + "learning_rate": 5.193524529199994e-07, + "loss": 0.0897, + "num_tokens": 21305678.0, + "reward": 0.74859619140625, + "reward_std": 0.003609629347920418, + "rewards//mean": 0.74859619140625, + "rewards//std": 0.029118523001670837, + "step": 2465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4932, + "grad_norm": 6.517587184906006, + "kl": 0.7804556041955948, + "learning_rate": 5.19035354234695e-07, + "loss": 0.078, + "num_tokens": 21314342.0, + "reward": 0.78961181640625, + "reward_std": 0.008517956361174583, + "rewards//mean": 0.78961181640625, + "rewards//std": 0.024850621819496155, + "step": 2466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4934, + "grad_norm": 15.03554916381836, + "kl": 0.7857796475291252, + "learning_rate": 5.187182478819523e-07, + "loss": 0.0786, + "num_tokens": 21322862.0, + "reward": 0.78167724609375, + "reward_std": 0.006496594287455082, + "rewards//mean": 0.78167724609375, + "rewards//std": 0.030346542596817017, + "step": 2467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4936, + "grad_norm": 10.269462585449219, + "kl": 2.1855060923844576, + "learning_rate": 5.184011339895015e-07, + "loss": 0.2186, + "num_tokens": 21331414.0, + "reward": 0.7193603515625, + "reward_std": 0.008378187194466591, + "rewards//mean": 0.7193603515625, + "rewards//std": 0.03542199358344078, + "step": 2468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4938, + "grad_norm": 2.0247483253479004, + "kl": 1.0551101323217154, + "learning_rate": 5.180840126850763e-07, + "loss": 0.1055, + "num_tokens": 21339982.0, + "reward": 0.77117919921875, + "reward_std": 0.0048871347680687904, + "rewards//mean": 0.77117919921875, + "rewards//std": 0.023497410118579865, + "step": 2469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.494, + "grad_norm": 3.718127727508545, + "kl": 2.4600570537149906, + "learning_rate": 5.177668840964127e-07, + "loss": 0.246, + "num_tokens": 21348622.0, + "reward": 0.74346923828125, + "reward_std": 0.017451688647270203, + "rewards//mean": 0.74346923828125, + "rewards//std": 0.0296026524156332, + "step": 2470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4942, + "grad_norm": 8.385236740112305, + "kl": 1.6674923561513424, + "learning_rate": 5.174497483512505e-07, + "loss": 0.1667, + "num_tokens": 21357278.0, + "reward": 0.774658203125, + "reward_std": 0.009503766894340515, + "rewards//mean": 0.774658203125, + "rewards//std": 0.028081176802515984, + "step": 2471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4944, + "grad_norm": 3.937204360961914, + "kl": 1.1690345369279385, + "learning_rate": 5.171326055773317e-07, + "loss": 0.1169, + "num_tokens": 21365854.0, + "reward": 0.79052734375, + "reward_std": 0.007389562204480171, + "rewards//mean": 0.79052734375, + "rewards//std": 0.02521473541855812, + "step": 2472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4946, + "grad_norm": 13.29330062866211, + "kl": 2.066753152757883, + "learning_rate": 5.168154559024014e-07, + "loss": 0.2067, + "num_tokens": 21374486.0, + "reward": 0.7403564453125, + "reward_std": 0.008129255846142769, + "rewards//mean": 0.7403564453125, + "rewards//std": 0.030851947143673897, + "step": 2473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4948, + "grad_norm": 6.27092981338501, + "kl": 1.494903340935707, + "learning_rate": 5.164982994542076e-07, + "loss": 0.1495, + "num_tokens": 21383086.0, + "reward": 0.7672119140625, + "reward_std": 0.0065413280390203, + "rewards//mean": 0.7672119140625, + "rewards//std": 0.01790364645421505, + "step": 2474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.495, + "grad_norm": 10.738384246826172, + "kl": 1.7184263709932566, + "learning_rate": 5.161811363605005e-07, + "loss": 0.1718, + "num_tokens": 21391718.0, + "reward": 0.78082275390625, + "reward_std": 0.013473032973706722, + "rewards//mean": 0.78082275390625, + "rewards//std": 0.037668049335479736, + "step": 2475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4952, + "grad_norm": 3.391979694366455, + "kl": 1.3498054444789886, + "learning_rate": 5.158639667490338e-07, + "loss": 0.135, + "num_tokens": 21400294.0, + "reward": 0.782958984375, + "reward_std": 0.01300393883138895, + "rewards//mean": 0.782958984375, + "rewards//std": 0.030913952738046646, + "step": 2476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4954, + "grad_norm": 3.0491490364074707, + "kl": 1.2808248046785593, + "learning_rate": 5.155467907475631e-07, + "loss": 0.1281, + "num_tokens": 21408862.0, + "reward": 0.777587890625, + "reward_std": 0.012639385648071766, + "rewards//mean": 0.777587890625, + "rewards//std": 0.02757638320326805, + "step": 2477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4956, + "grad_norm": 17.56730079650879, + "kl": 2.3768493980169296, + "learning_rate": 5.152296084838471e-07, + "loss": 0.2377, + "num_tokens": 21417542.0, + "reward": 0.757080078125, + "reward_std": 0.013099430128932, + "rewards//mean": 0.757080078125, + "rewards//std": 0.039780572056770325, + "step": 2478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4958, + "grad_norm": 3.2529489994049072, + "kl": 1.116445790976286, + "learning_rate": 5.149124200856465e-07, + "loss": 0.1116, + "num_tokens": 21426230.0, + "reward": 0.7427978515625, + "reward_std": 0.007933074608445168, + "rewards//mean": 0.7427978515625, + "rewards//std": 0.02932450734078884, + "step": 2479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.496, + "grad_norm": 3.5040981769561768, + "kl": 1.767238100990653, + "learning_rate": 5.145952256807249e-07, + "loss": 0.1767, + "num_tokens": 21434910.0, + "reward": 0.7271728515625, + "reward_std": 0.004354175645858049, + "rewards//mean": 0.7271728515625, + "rewards//std": 0.020495999604463577, + "step": 2480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4962, + "grad_norm": 18.201961517333984, + "kl": 1.389025716111064, + "learning_rate": 5.142780253968481e-07, + "loss": 0.1389, + "num_tokens": 21443566.0, + "reward": 0.75555419921875, + "reward_std": 0.009230272844433784, + "rewards//mean": 0.75555419921875, + "rewards//std": 0.03486781567335129, + "step": 2481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4964, + "grad_norm": 6.012240409851074, + "kl": 1.274559061974287, + "learning_rate": 5.139608193617844e-07, + "loss": 0.1275, + "num_tokens": 21452254.0, + "reward": 0.78106689453125, + "reward_std": 0.00610141409561038, + "rewards//mean": 0.78106689453125, + "rewards//std": 0.02788691222667694, + "step": 2482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4966, + "grad_norm": 22.940805435180664, + "kl": 2.5159899834543467, + "learning_rate": 5.136436077033044e-07, + "loss": 0.2516, + "num_tokens": 21460798.0, + "reward": 0.72625732421875, + "reward_std": 0.012220161035656929, + "rewards//mean": 0.72625732421875, + "rewards//std": 0.04006225988268852, + "step": 2483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4968, + "grad_norm": 4.167654991149902, + "kl": 1.326149519532919, + "learning_rate": 5.133263905491808e-07, + "loss": 0.1326, + "num_tokens": 21469454.0, + "reward": 0.7452392578125, + "reward_std": 0.008931857533752918, + "rewards//mean": 0.7452392578125, + "rewards//std": 0.02446361631155014, + "step": 2484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.497, + "grad_norm": 7.900535583496094, + "kl": 2.1057394705712795, + "learning_rate": 5.130091680271886e-07, + "loss": 0.2106, + "num_tokens": 21478150.0, + "reward": 0.78826904296875, + "reward_std": 0.019098900258541107, + "rewards//mean": 0.78826904296875, + "rewards//std": 0.03416169062256813, + "step": 2485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4972, + "grad_norm": 4.1555023193359375, + "kl": 1.4613208267837763, + "learning_rate": 5.126919402651052e-07, + "loss": 0.1461, + "num_tokens": 21486814.0, + "reward": 0.77056884765625, + "reward_std": 0.013614080846309662, + "rewards//mean": 0.77056884765625, + "rewards//std": 0.04148749262094498, + "step": 2486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4974, + "grad_norm": 6.051153182983398, + "kl": 1.9178170319646597, + "learning_rate": 5.123747073907097e-07, + "loss": 0.1918, + "num_tokens": 21495382.0, + "reward": 0.7935791015625, + "reward_std": 0.01421187911182642, + "rewards//mean": 0.7935791015625, + "rewards//std": 0.02863076888024807, + "step": 2487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4976, + "grad_norm": 13.912383079528809, + "kl": 2.8309224639087915, + "learning_rate": 5.120574695317836e-07, + "loss": 0.2831, + "num_tokens": 21504142.0, + "reward": 0.74176025390625, + "reward_std": 0.013970336876809597, + "rewards//mean": 0.74176025390625, + "rewards//std": 0.03970891982316971, + "step": 2488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4978, + "grad_norm": 8.304642677307129, + "kl": 1.4212545230984688, + "learning_rate": 5.117402268161101e-07, + "loss": 0.1421, + "num_tokens": 21512782.0, + "reward": 0.77886962890625, + "reward_std": 0.0072861951775848866, + "rewards//mean": 0.77886962890625, + "rewards//std": 0.04056878387928009, + "step": 2489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.498, + "grad_norm": 3.470306396484375, + "kl": 1.776834374293685, + "learning_rate": 5.114229793714748e-07, + "loss": 0.1777, + "num_tokens": 21521406.0, + "reward": 0.75457763671875, + "reward_std": 0.010919044725596905, + "rewards//mean": 0.75457763671875, + "rewards//std": 0.03370039165019989, + "step": 2490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4982, + "grad_norm": 2.4034996032714844, + "kl": 1.5563668627291918, + "learning_rate": 5.111057273256647e-07, + "loss": 0.1556, + "num_tokens": 21530030.0, + "reward": 0.7620849609375, + "reward_std": 0.011739077977836132, + "rewards//mean": 0.7620849609375, + "rewards//std": 0.032576024532318115, + "step": 2491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4984, + "grad_norm": 4.817319393157959, + "kl": 2.0727260634303093, + "learning_rate": 5.107884708064689e-07, + "loss": 0.2073, + "num_tokens": 21538614.0, + "reward": 0.75823974609375, + "reward_std": 0.016387753188610077, + "rewards//mean": 0.75823974609375, + "rewards//std": 0.03521556779742241, + "step": 2492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4986, + "grad_norm": 4.695708751678467, + "kl": 1.2218250911682844, + "learning_rate": 5.104712099416785e-07, + "loss": 0.1222, + "num_tokens": 21547222.0, + "reward": 0.763916015625, + "reward_std": 0.009938977658748627, + "rewards//mean": 0.763916015625, + "rewards//std": 0.030479997396469116, + "step": 2493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4988, + "grad_norm": 4.104818820953369, + "kl": 1.0049883034080267, + "learning_rate": 5.101539448590858e-07, + "loss": 0.1005, + "num_tokens": 21555862.0, + "reward": 0.76922607421875, + "reward_std": 0.011079177260398865, + "rewards//mean": 0.76922607421875, + "rewards//std": 0.02895953133702278, + "step": 2494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.499, + "grad_norm": 3.331050157546997, + "kl": 1.5768256355077028, + "learning_rate": 5.098366756864855e-07, + "loss": 0.1577, + "num_tokens": 21564470.0, + "reward": 0.75054931640625, + "reward_std": 0.010783007368445396, + "rewards//mean": 0.75054931640625, + "rewards//std": 0.025447722524404526, + "step": 2495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4992, + "grad_norm": 3.882225513458252, + "kl": 1.3045880999416113, + "learning_rate": 5.095194025516732e-07, + "loss": 0.1305, + "num_tokens": 21573102.0, + "reward": 0.73992919921875, + "reward_std": 0.009225038811564445, + "rewards//mean": 0.73992919921875, + "rewards//std": 0.0282268188893795, + "step": 2496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4994, + "grad_norm": 2.7224843502044678, + "kl": 1.6920190043747425, + "learning_rate": 5.09202125582447e-07, + "loss": 0.1692, + "num_tokens": 21581718.0, + "reward": 0.75469970703125, + "reward_std": 0.01124764047563076, + "rewards//mean": 0.75469970703125, + "rewards//std": 0.022916875779628754, + "step": 2497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4996, + "grad_norm": 4.1161627769470215, + "kl": 1.1753621995449066, + "learning_rate": 5.088848449066054e-07, + "loss": 0.1175, + "num_tokens": 21590302.0, + "reward": 0.7376708984375, + "reward_std": 0.01029042899608612, + "rewards//mean": 0.7376708984375, + "rewards//std": 0.03090881183743477, + "step": 2498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.4998, + "grad_norm": 7.0367960929870605, + "kl": 1.4255115538835526, + "learning_rate": 5.085675606519497e-07, + "loss": 0.1426, + "num_tokens": 21598910.0, + "reward": 0.76507568359375, + "reward_std": 0.005431991070508957, + "rewards//mean": 0.76507568359375, + "rewards//std": 0.03418782353401184, + "step": 2499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 127.546875, + "epoch": 0.5, + "grad_norm": 14.416479110717773, + "kl": 2.5580179803073406, + "learning_rate": 5.082502729462812e-07, + "loss": 0.2525, + "num_tokens": 21607585.0, + "reward": 0.76202392578125, + "reward_std": 0.00923288706690073, + "rewards//mean": 0.76202392578125, + "rewards//std": 0.02869909070432186, + "step": 2500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5002, + "grad_norm": 1.6868007183074951, + "kl": 1.3712705317884684, + "learning_rate": 5.07932981917404e-07, + "loss": 0.1371, + "num_tokens": 21616297.0, + "reward": 0.78607177734375, + "reward_std": 0.008597662672400475, + "rewards//mean": 0.78607177734375, + "rewards//std": 0.02900497056543827, + "step": 2501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5004, + "grad_norm": 4.214382171630859, + "kl": 1.5227914098650217, + "learning_rate": 5.076156876931225e-07, + "loss": 0.1523, + "num_tokens": 21624953.0, + "reward": 0.76739501953125, + "reward_std": 0.009278219193220139, + "rewards//mean": 0.76739501953125, + "rewards//std": 0.025656845420598984, + "step": 2502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5006, + "grad_norm": 9.166537284851074, + "kl": 2.9747222997248173, + "learning_rate": 5.072983904012429e-07, + "loss": 0.2975, + "num_tokens": 21633585.0, + "reward": 0.7298583984375, + "reward_std": 0.016209067776799202, + "rewards//mean": 0.7298583984375, + "rewards//std": 0.03874010592699051, + "step": 2503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5008, + "grad_norm": 3.9825174808502197, + "kl": 0.7611479051411152, + "learning_rate": 5.069810901695727e-07, + "loss": 0.0761, + "num_tokens": 21642193.0, + "reward": 0.76116943359375, + "reward_std": 0.005227300338447094, + "rewards//mean": 0.76116943359375, + "rewards//std": 0.030606811866164207, + "step": 2504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.501, + "grad_norm": 3.5896291732788086, + "kl": 1.6619261093437672, + "learning_rate": 5.0666378712592e-07, + "loss": 0.1662, + "num_tokens": 21650761.0, + "reward": 0.75909423828125, + "reward_std": 0.009216178208589554, + "rewards//mean": 0.75909423828125, + "rewards//std": 0.030758274719119072, + "step": 2505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5012, + "grad_norm": 5.794517517089844, + "kl": 1.7589306011795998, + "learning_rate": 5.063464813980948e-07, + "loss": 0.1759, + "num_tokens": 21659409.0, + "reward": 0.7840576171875, + "reward_std": 0.012140953913331032, + "rewards//mean": 0.7840576171875, + "rewards//std": 0.03490019962191582, + "step": 2506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5014, + "grad_norm": 10.752869606018066, + "kl": 1.1706451326608658, + "learning_rate": 5.060291731139076e-07, + "loss": 0.1171, + "num_tokens": 21668041.0, + "reward": 0.77349853515625, + "reward_std": 0.00513218529522419, + "rewards//mean": 0.77349853515625, + "rewards//std": 0.03743181377649307, + "step": 2507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5016, + "grad_norm": 6.370254039764404, + "kl": 1.7088481038808823, + "learning_rate": 5.057118624011702e-07, + "loss": 0.1709, + "num_tokens": 21676641.0, + "reward": 0.78173828125, + "reward_std": 0.012616826221346855, + "rewards//mean": 0.78173828125, + "rewards//std": 0.03624846786260605, + "step": 2508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5018, + "grad_norm": 8.031913757324219, + "kl": 1.7459226585924625, + "learning_rate": 5.053945493876952e-07, + "loss": 0.1746, + "num_tokens": 21685281.0, + "reward": 0.74932861328125, + "reward_std": 0.014033197425305843, + "rewards//mean": 0.74932861328125, + "rewards//std": 0.03294447064399719, + "step": 2509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.502, + "grad_norm": 9.29623794555664, + "kl": 2.177991133183241, + "learning_rate": 5.050772342012966e-07, + "loss": 0.2178, + "num_tokens": 21693921.0, + "reward": 0.74853515625, + "reward_std": 0.007827691733837128, + "rewards//mean": 0.74853515625, + "rewards//std": 0.022493600845336914, + "step": 2510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5022, + "grad_norm": 6.148255825042725, + "kl": 2.596444422379136, + "learning_rate": 5.047599169697883e-07, + "loss": 0.2596, + "num_tokens": 21702657.0, + "reward": 0.7349853515625, + "reward_std": 0.015206518582999706, + "rewards//mean": 0.7349853515625, + "rewards//std": 0.043012239038944244, + "step": 2511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5024, + "grad_norm": 3.8137447834014893, + "kl": 1.1198139805346727, + "learning_rate": 5.044425978209863e-07, + "loss": 0.112, + "num_tokens": 21711265.0, + "reward": 0.78125, + "reward_std": 0.01210021786391735, + "rewards//mean": 0.78125, + "rewards//std": 0.027943916618824005, + "step": 2512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5026, + "grad_norm": 8.862525939941406, + "kl": 2.2017168663442135, + "learning_rate": 5.041252768827063e-07, + "loss": 0.2202, + "num_tokens": 21719913.0, + "reward": 0.77227783203125, + "reward_std": 0.014344670809805393, + "rewards//mean": 0.77227783203125, + "rewards//std": 0.035520732402801514, + "step": 2513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5028, + "grad_norm": 2.6638264656066895, + "kl": 1.2830819711089134, + "learning_rate": 5.038079542827653e-07, + "loss": 0.1283, + "num_tokens": 21728609.0, + "reward": 0.7322998046875, + "reward_std": 0.008098629303276539, + "rewards//mean": 0.7322998046875, + "rewards//std": 0.025305526331067085, + "step": 2514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.503, + "grad_norm": 10.608713150024414, + "kl": 2.336262756958604, + "learning_rate": 5.034906301489807e-07, + "loss": 0.2336, + "num_tokens": 21737233.0, + "reward": 0.76214599609375, + "reward_std": 0.013304581865668297, + "rewards//mean": 0.76214599609375, + "rewards//std": 0.02846021205186844, + "step": 2515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5032, + "grad_norm": 14.8688325881958, + "kl": 2.644262967631221, + "learning_rate": 5.03173304609171e-07, + "loss": 0.2644, + "num_tokens": 21745913.0, + "reward": 0.7767333984375, + "reward_std": 0.017812861129641533, + "rewards//mean": 0.7767333984375, + "rewards//std": 0.027038391679525375, + "step": 2516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5034, + "grad_norm": 9.634334564208984, + "kl": 1.6725591868162155, + "learning_rate": 5.028559777911541e-07, + "loss": 0.1673, + "num_tokens": 21754569.0, + "reward": 0.7752685546875, + "reward_std": 0.00907925982028246, + "rewards//mean": 0.7752685546875, + "rewards//std": 0.03519390523433685, + "step": 2517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5036, + "grad_norm": 23.61309242248535, + "kl": 2.593292750418186, + "learning_rate": 5.025386498227501e-07, + "loss": 0.2593, + "num_tokens": 21763233.0, + "reward": 0.73980712890625, + "reward_std": 0.012684599496424198, + "rewards//mean": 0.73980712890625, + "rewards//std": 0.03931353613734245, + "step": 2518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5038, + "grad_norm": 2.70405912399292, + "kl": 1.8524313531816006, + "learning_rate": 5.022213208317781e-07, + "loss": 0.1852, + "num_tokens": 21771929.0, + "reward": 0.7574462890625, + "reward_std": 0.012638472020626068, + "rewards//mean": 0.7574462890625, + "rewards//std": 0.027976131066679955, + "step": 2519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.504, + "grad_norm": 18.070302963256836, + "kl": 0.6661789119243622, + "learning_rate": 5.019039909460583e-07, + "loss": 0.0666, + "num_tokens": 21780473.0, + "reward": 0.79486083984375, + "reward_std": 0.006178667303174734, + "rewards//mean": 0.79486083984375, + "rewards//std": 0.024498475715517998, + "step": 2520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5042, + "grad_norm": 30.70311164855957, + "kl": 3.076902646571398, + "learning_rate": 5.015866602934111e-07, + "loss": 0.3077, + "num_tokens": 21789169.0, + "reward": 0.75311279296875, + "reward_std": 0.017096389085054398, + "rewards//mean": 0.75311279296875, + "rewards//std": 0.04234806075692177, + "step": 2521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5044, + "grad_norm": 19.738916397094727, + "kl": 2.290887352079153, + "learning_rate": 5.012693290016575e-07, + "loss": 0.2291, + "num_tokens": 21797801.0, + "reward": 0.7427978515625, + "reward_std": 0.009415658190846443, + "rewards//mean": 0.7427978515625, + "rewards//std": 0.03145255148410797, + "step": 2522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5046, + "grad_norm": 12.086870193481445, + "kl": 2.8087721131742, + "learning_rate": 5.009519971986182e-07, + "loss": 0.2809, + "num_tokens": 21806441.0, + "reward": 0.74462890625, + "reward_std": 0.017057452350854874, + "rewards//mean": 0.74462890625, + "rewards//std": 0.04520472139120102, + "step": 2523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5048, + "grad_norm": 7.494409084320068, + "kl": 3.2985002156347036, + "learning_rate": 5.006346650121147e-07, + "loss": 0.3299, + "num_tokens": 21815097.0, + "reward": 0.73736572265625, + "reward_std": 0.016291610896587372, + "rewards//mean": 0.73736572265625, + "rewards//std": 0.034943707287311554, + "step": 2524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.505, + "grad_norm": 14.198507308959961, + "kl": 3.5438093543052673, + "learning_rate": 5.003173325699681e-07, + "loss": 0.3544, + "num_tokens": 21823777.0, + "reward": 0.77203369140625, + "reward_std": 0.016144435852766037, + "rewards//mean": 0.77203369140625, + "rewards//std": 0.03171442449092865, + "step": 2525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5052, + "grad_norm": 14.286942481994629, + "kl": 2.3999201990664005, + "learning_rate": 5e-07, + "loss": 0.24, + "num_tokens": 21832409.0, + "reward": 0.7752685546875, + "reward_std": 0.00915016233921051, + "rewards//mean": 0.7752685546875, + "rewards//std": 0.0372987799346447, + "step": 2526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5054, + "grad_norm": 6.3075175285339355, + "kl": 2.3677035477012396, + "learning_rate": 4.996826674300319e-07, + "loss": 0.2368, + "num_tokens": 21840993.0, + "reward": 0.72332763671875, + "reward_std": 0.01639110967516899, + "rewards//mean": 0.72332763671875, + "rewards//std": 0.035852447152137756, + "step": 2527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5056, + "grad_norm": 6.574167251586914, + "kl": 1.8813174348324537, + "learning_rate": 4.993653349878853e-07, + "loss": 0.1881, + "num_tokens": 21849633.0, + "reward": 0.75726318359375, + "reward_std": 0.01830998994410038, + "rewards//mean": 0.75726318359375, + "rewards//std": 0.03779482841491699, + "step": 2528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5058, + "grad_norm": 2.445411443710327, + "kl": 1.5679683908820152, + "learning_rate": 4.990480028013818e-07, + "loss": 0.1568, + "num_tokens": 21858265.0, + "reward": 0.743896484375, + "reward_std": 0.012818719260394573, + "rewards//mean": 0.743896484375, + "rewards//std": 0.03564458340406418, + "step": 2529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.506, + "grad_norm": 8.395454406738281, + "kl": 1.0607334673404694, + "learning_rate": 4.987306709983425e-07, + "loss": 0.1061, + "num_tokens": 21866865.0, + "reward": 0.7783203125, + "reward_std": 0.006326441653072834, + "rewards//mean": 0.7783203125, + "rewards//std": 0.0255344957113266, + "step": 2530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5062, + "grad_norm": 9.078377723693848, + "kl": 1.5781043153256178, + "learning_rate": 4.984133397065888e-07, + "loss": 0.1578, + "num_tokens": 21875481.0, + "reward": 0.75421142578125, + "reward_std": 0.009929932653903961, + "rewards//mean": 0.75421142578125, + "rewards//std": 0.029051383957266808, + "step": 2531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5064, + "grad_norm": 7.317929744720459, + "kl": 1.7005917839705944, + "learning_rate": 4.980960090539417e-07, + "loss": 0.1701, + "num_tokens": 21884057.0, + "reward": 0.75994873046875, + "reward_std": 0.008467942476272583, + "rewards//mean": 0.75994873046875, + "rewards//std": 0.03230883553624153, + "step": 2532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5066, + "grad_norm": 6.952382564544678, + "kl": 1.9908115305006504, + "learning_rate": 4.97778679168222e-07, + "loss": 0.1991, + "num_tokens": 21892705.0, + "reward": 0.7215576171875, + "reward_std": 0.01641327142715454, + "rewards//mean": 0.7215576171875, + "rewards//std": 0.04015986621379852, + "step": 2533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5068, + "grad_norm": 4.731269359588623, + "kl": 1.790529254823923, + "learning_rate": 4.9746135017725e-07, + "loss": 0.1791, + "num_tokens": 21901281.0, + "reward": 0.739501953125, + "reward_std": 0.014151472598314285, + "rewards//mean": 0.739501953125, + "rewards//std": 0.030725345015525818, + "step": 2534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.507, + "grad_norm": 8.864885330200195, + "kl": 1.8859914857894182, + "learning_rate": 4.971440222088458e-07, + "loss": 0.1886, + "num_tokens": 21910057.0, + "reward": 0.78875732421875, + "reward_std": 0.012724282220005989, + "rewards//mean": 0.78875732421875, + "rewards//std": 0.028084881603717804, + "step": 2535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5072, + "grad_norm": 6.596560001373291, + "kl": 1.4180640075355768, + "learning_rate": 4.968266953908291e-07, + "loss": 0.1418, + "num_tokens": 21918753.0, + "reward": 0.7681884765625, + "reward_std": 0.010321651585400105, + "rewards//mean": 0.7681884765625, + "rewards//std": 0.03502316400408745, + "step": 2536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5074, + "grad_norm": 6.714897155761719, + "kl": 1.5507013704627752, + "learning_rate": 4.965093698510192e-07, + "loss": 0.1551, + "num_tokens": 21927433.0, + "reward": 0.73883056640625, + "reward_std": 0.009078150615096092, + "rewards//mean": 0.73883056640625, + "rewards//std": 0.03332684561610222, + "step": 2537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5076, + "grad_norm": 6.680278778076172, + "kl": 1.0674237851053476, + "learning_rate": 4.961920457172346e-07, + "loss": 0.1067, + "num_tokens": 21936089.0, + "reward": 0.77398681640625, + "reward_std": 0.007194648962467909, + "rewards//mean": 0.77398681640625, + "rewards//std": 0.01933641918003559, + "step": 2538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5078, + "grad_norm": 7.657058238983154, + "kl": 1.8001793287694454, + "learning_rate": 4.958747231172937e-07, + "loss": 0.18, + "num_tokens": 21944673.0, + "reward": 0.7481689453125, + "reward_std": 0.011845672503113747, + "rewards//mean": 0.7481689453125, + "rewards//std": 0.034917544573545456, + "step": 2539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.508, + "grad_norm": 11.799047470092773, + "kl": 1.5087166856974363, + "learning_rate": 4.955574021790137e-07, + "loss": 0.1509, + "num_tokens": 21953305.0, + "reward": 0.73870849609375, + "reward_std": 0.012504545040428638, + "rewards//mean": 0.73870849609375, + "rewards//std": 0.033451974391937256, + "step": 2540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5082, + "grad_norm": 10.218378067016602, + "kl": 1.0806394089013338, + "learning_rate": 4.952400830302116e-07, + "loss": 0.1081, + "num_tokens": 21961873.0, + "reward": 0.7578125, + "reward_std": 0.007291832007467747, + "rewards//mean": 0.7578125, + "rewards//std": 0.02473468892276287, + "step": 2541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5084, + "grad_norm": 7.828004837036133, + "kl": 2.0790257044136524, + "learning_rate": 4.949227657987035e-07, + "loss": 0.2079, + "num_tokens": 21970617.0, + "reward": 0.78472900390625, + "reward_std": 0.02458987943828106, + "rewards//mean": 0.78472900390625, + "rewards//std": 0.040957964956760406, + "step": 2542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5086, + "grad_norm": 5.3359761238098145, + "kl": 1.8180590346455574, + "learning_rate": 4.946054506123048e-07, + "loss": 0.1818, + "num_tokens": 21979313.0, + "reward": 0.763427734375, + "reward_std": 0.015840966254472733, + "rewards//mean": 0.763427734375, + "rewards//std": 0.029461754485964775, + "step": 2543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5088, + "grad_norm": 4.489680767059326, + "kl": 1.0474360268563032, + "learning_rate": 4.942881375988299e-07, + "loss": 0.1047, + "num_tokens": 21987913.0, + "reward": 0.7430419921875, + "reward_std": 0.012303894385695457, + "rewards//mean": 0.7430419921875, + "rewards//std": 0.027254750952124596, + "step": 2544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.509, + "grad_norm": 5.48600435256958, + "kl": 0.7937683835625648, + "learning_rate": 4.939708268860924e-07, + "loss": 0.0794, + "num_tokens": 21996473.0, + "reward": 0.7415771484375, + "reward_std": 0.0045309001579880714, + "rewards//mean": 0.7415771484375, + "rewards//std": 0.02060207910835743, + "step": 2545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5092, + "grad_norm": 3.4347469806671143, + "kl": 1.4726912286132574, + "learning_rate": 4.936535186019052e-07, + "loss": 0.1473, + "num_tokens": 22005193.0, + "reward": 0.7679443359375, + "reward_std": 0.011799246072769165, + "rewards//mean": 0.7679443359375, + "rewards//std": 0.03584679588675499, + "step": 2546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5094, + "grad_norm": 4.923105239868164, + "kl": 0.9657264780253172, + "learning_rate": 4.933362128740799e-07, + "loss": 0.0966, + "num_tokens": 22013809.0, + "reward": 0.76470947265625, + "reward_std": 0.008658932521939278, + "rewards//mean": 0.76470947265625, + "rewards//std": 0.02222541905939579, + "step": 2547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5096, + "grad_norm": 2.5681509971618652, + "kl": 0.7202414702624083, + "learning_rate": 4.930189098304274e-07, + "loss": 0.072, + "num_tokens": 22022361.0, + "reward": 0.75286865234375, + "reward_std": 0.007256942335516214, + "rewards//mean": 0.75286865234375, + "rewards//std": 0.026284601539373398, + "step": 2548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5098, + "grad_norm": 6.287543296813965, + "kl": 0.9242824912071228, + "learning_rate": 4.92701609598757e-07, + "loss": 0.0924, + "num_tokens": 22030897.0, + "reward": 0.73968505859375, + "reward_std": 0.009582065977156162, + "rewards//mean": 0.73968505859375, + "rewards//std": 0.03696218132972717, + "step": 2549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.51, + "grad_norm": 6.290980815887451, + "kl": 0.9175565987825394, + "learning_rate": 4.923843123068775e-07, + "loss": 0.0918, + "num_tokens": 22039529.0, + "reward": 0.76275634765625, + "reward_std": 0.004656442906707525, + "rewards//mean": 0.76275634765625, + "rewards//std": 0.02650483138859272, + "step": 2550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5102, + "grad_norm": 7.5758466720581055, + "kl": 1.5644459370523691, + "learning_rate": 4.92067018082596e-07, + "loss": 0.1564, + "num_tokens": 22048161.0, + "reward": 0.74432373046875, + "reward_std": 0.009754505008459091, + "rewards//mean": 0.74432373046875, + "rewards//std": 0.03040883131325245, + "step": 2551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5104, + "grad_norm": 4.6259284019470215, + "kl": 1.0289939064532518, + "learning_rate": 4.917497270537187e-07, + "loss": 0.1029, + "num_tokens": 22056761.0, + "reward": 0.766357421875, + "reward_std": 0.009532168507575989, + "rewards//mean": 0.766357421875, + "rewards//std": 0.03462433069944382, + "step": 2552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5106, + "grad_norm": 3.6556060314178467, + "kl": 0.9543606154620647, + "learning_rate": 4.914324393480503e-07, + "loss": 0.0954, + "num_tokens": 22065361.0, + "reward": 0.78985595703125, + "reward_std": 0.00709438044577837, + "rewards//mean": 0.78985595703125, + "rewards//std": 0.023106331005692482, + "step": 2553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5108, + "grad_norm": 5.405599117279053, + "kl": 1.2493596263229847, + "learning_rate": 4.911151550933945e-07, + "loss": 0.1249, + "num_tokens": 22073897.0, + "reward": 0.73199462890625, + "reward_std": 0.007694587577134371, + "rewards//mean": 0.73199462890625, + "rewards//std": 0.03233646973967552, + "step": 2554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.511, + "grad_norm": 3.856583595275879, + "kl": 0.993172038346529, + "learning_rate": 4.90797874417553e-07, + "loss": 0.0993, + "num_tokens": 22082577.0, + "reward": 0.72845458984375, + "reward_std": 0.005823909305036068, + "rewards//mean": 0.72845458984375, + "rewards//std": 0.03150466829538345, + "step": 2555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5112, + "grad_norm": 4.748661041259766, + "kl": 0.7111566811800003, + "learning_rate": 4.904805974483266e-07, + "loss": 0.0711, + "num_tokens": 22091313.0, + "reward": 0.77716064453125, + "reward_std": 0.005588800646364689, + "rewards//mean": 0.77716064453125, + "rewards//std": 0.026267895475029945, + "step": 2556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 127.625, + "epoch": 0.5114, + "grad_norm": 5.557389259338379, + "kl": 1.1699679382145405, + "learning_rate": 4.901633243135143e-07, + "loss": 0.1105, + "num_tokens": 22099961.0, + "reward": 0.73870849609375, + "reward_std": 0.014138857834041119, + "rewards//mean": 0.73870849609375, + "rewards//std": 0.028289495036005974, + "step": 2557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5116, + "grad_norm": 6.33830451965332, + "kl": 1.0815546792000532, + "learning_rate": 4.89846055140914e-07, + "loss": 0.1082, + "num_tokens": 22108521.0, + "reward": 0.7506103515625, + "reward_std": 0.004612094722688198, + "rewards//mean": 0.7506103515625, + "rewards//std": 0.041047170758247375, + "step": 2558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5118, + "grad_norm": 5.517125129699707, + "kl": 1.0506632905453444, + "learning_rate": 4.895287900583216e-07, + "loss": 0.1051, + "num_tokens": 22117129.0, + "reward": 0.77960205078125, + "reward_std": 0.00885198637843132, + "rewards//mean": 0.77960205078125, + "rewards//std": 0.020145412534475327, + "step": 2559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.512, + "grad_norm": 3.487072706222534, + "kl": 1.1352508570998907, + "learning_rate": 4.892115291935309e-07, + "loss": 0.1135, + "num_tokens": 22125865.0, + "reward": 0.75054931640625, + "reward_std": 0.0037888444494456053, + "rewards//mean": 0.75054931640625, + "rewards//std": 0.03589506447315216, + "step": 2560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5122, + "grad_norm": 2.4822616577148438, + "kl": 1.00700943171978, + "learning_rate": 4.888942726743353e-07, + "loss": 0.1007, + "num_tokens": 22134497.0, + "reward": 0.76702880859375, + "reward_std": 0.0077543314546346664, + "rewards//mean": 0.76702880859375, + "rewards//std": 0.023139717057347298, + "step": 2561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5124, + "grad_norm": 3.9019739627838135, + "kl": 0.9671955890953541, + "learning_rate": 4.885770206285252e-07, + "loss": 0.0967, + "num_tokens": 22143073.0, + "reward": 0.73651123046875, + "reward_std": 0.0056818085722625256, + "rewards//mean": 0.73651123046875, + "rewards//std": 0.022998636588454247, + "step": 2562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5126, + "grad_norm": 4.723089694976807, + "kl": 0.9810960814356804, + "learning_rate": 4.882597731838898e-07, + "loss": 0.0981, + "num_tokens": 22151721.0, + "reward": 0.73272705078125, + "reward_std": 0.004166848491877317, + "rewards//mean": 0.73272705078125, + "rewards//std": 0.04137386009097099, + "step": 2563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5128, + "grad_norm": 1.6453090906143188, + "kl": 0.5584238544106483, + "learning_rate": 4.879425304682163e-07, + "loss": 0.0558, + "num_tokens": 22160289.0, + "reward": 0.7440185546875, + "reward_std": 0.0017263349145650864, + "rewards//mean": 0.7440185546875, + "rewards//std": 0.03187897056341171, + "step": 2564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.513, + "grad_norm": 2.8847897052764893, + "kl": 0.5963284056633711, + "learning_rate": 4.876252926092902e-07, + "loss": 0.0596, + "num_tokens": 22168905.0, + "reward": 0.7298583984375, + "reward_std": 0.00178057630546391, + "rewards//mean": 0.7298583984375, + "rewards//std": 0.03050655499100685, + "step": 2565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5132, + "grad_norm": 3.6680257320404053, + "kl": 1.255133956670761, + "learning_rate": 4.873080597348947e-07, + "loss": 0.1255, + "num_tokens": 22177601.0, + "reward": 0.76611328125, + "reward_std": 0.008387265726923943, + "rewards//mean": 0.76611328125, + "rewards//std": 0.026139140129089355, + "step": 2566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5134, + "grad_norm": 2.4625751972198486, + "kl": 1.0458602719008923, + "learning_rate": 4.869908319728113e-07, + "loss": 0.1046, + "num_tokens": 22186153.0, + "reward": 0.76336669921875, + "reward_std": 0.007889105938374996, + "rewards//mean": 0.76336669921875, + "rewards//std": 0.026782656088471413, + "step": 2567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5136, + "grad_norm": 3.9797255992889404, + "kl": 1.1420323513448238, + "learning_rate": 4.866736094508191e-07, + "loss": 0.1142, + "num_tokens": 22194857.0, + "reward": 0.7445068359375, + "reward_std": 0.007372075691819191, + "rewards//mean": 0.7445068359375, + "rewards//std": 0.0204190444201231, + "step": 2568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5138, + "grad_norm": 5.16852331161499, + "kl": 2.139840357005596, + "learning_rate": 4.863563922966956e-07, + "loss": 0.214, + "num_tokens": 22203505.0, + "reward": 0.744384765625, + "reward_std": 0.016666950657963753, + "rewards//mean": 0.744384765625, + "rewards//std": 0.037711478769779205, + "step": 2569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.514, + "grad_norm": 5.785426616668701, + "kl": 1.2060081604868174, + "learning_rate": 4.860391806382156e-07, + "loss": 0.1206, + "num_tokens": 22212185.0, + "reward": 0.7569580078125, + "reward_std": 0.01335352286696434, + "rewards//mean": 0.7569580078125, + "rewards//std": 0.03941107541322708, + "step": 2570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5142, + "grad_norm": 5.690269470214844, + "kl": 1.0841991622000933, + "learning_rate": 4.857219746031519e-07, + "loss": 0.1084, + "num_tokens": 22220769.0, + "reward": 0.77423095703125, + "reward_std": 0.01410963200032711, + "rewards//mean": 0.77423095703125, + "rewards//std": 0.028477758169174194, + "step": 2571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5144, + "grad_norm": 3.854118824005127, + "kl": 1.5908323153853416, + "learning_rate": 4.854047743192752e-07, + "loss": 0.1591, + "num_tokens": 22229425.0, + "reward": 0.7589111328125, + "reward_std": 0.01034512184560299, + "rewards//mean": 0.7589111328125, + "rewards//std": 0.022611400112509727, + "step": 2572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5146, + "grad_norm": 2.3943686485290527, + "kl": 1.7506914753466845, + "learning_rate": 4.850875799143536e-07, + "loss": 0.1751, + "num_tokens": 22238001.0, + "reward": 0.755615234375, + "reward_std": 0.013219613581895828, + "rewards//mean": 0.755615234375, + "rewards//std": 0.032037414610385895, + "step": 2573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5148, + "grad_norm": 4.672167778015137, + "kl": 1.1741195898503065, + "learning_rate": 4.84770391516153e-07, + "loss": 0.1174, + "num_tokens": 22246697.0, + "reward": 0.7518310546875, + "reward_std": 0.008454401046037674, + "rewards//mean": 0.7518310546875, + "rewards//std": 0.0391242615878582, + "step": 2574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.515, + "grad_norm": 3.1410436630249023, + "kl": 1.1982522066682577, + "learning_rate": 4.84453209252437e-07, + "loss": 0.1198, + "num_tokens": 22255345.0, + "reward": 0.783935546875, + "reward_std": 0.011136407032608986, + "rewards//mean": 0.783935546875, + "rewards//std": 0.03699170798063278, + "step": 2575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5152, + "grad_norm": 5.193376541137695, + "kl": 1.8655571769922972, + "learning_rate": 4.841360332509662e-07, + "loss": 0.1866, + "num_tokens": 22264033.0, + "reward": 0.76898193359375, + "reward_std": 0.017385311424732208, + "rewards//mean": 0.76898193359375, + "rewards//std": 0.034708116203546524, + "step": 2576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5154, + "grad_norm": 4.9763689041137695, + "kl": 2.071575351059437, + "learning_rate": 4.838188636394996e-07, + "loss": 0.2072, + "num_tokens": 22272681.0, + "reward": 0.75347900390625, + "reward_std": 0.011817192658782005, + "rewards//mean": 0.75347900390625, + "rewards//std": 0.037260763347148895, + "step": 2577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5156, + "grad_norm": 6.402245044708252, + "kl": 1.0930734928697348, + "learning_rate": 4.835017005457925e-07, + "loss": 0.1093, + "num_tokens": 22281369.0, + "reward": 0.7833251953125, + "reward_std": 0.005407290067523718, + "rewards//mean": 0.7833251953125, + "rewards//std": 0.02952001616358757, + "step": 2578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5158, + "grad_norm": 11.510376930236816, + "kl": 2.2528360430151224, + "learning_rate": 4.831845440975987e-07, + "loss": 0.2253, + "num_tokens": 22290145.0, + "reward": 0.75323486328125, + "reward_std": 0.009901678189635277, + "rewards//mean": 0.75323486328125, + "rewards//std": 0.04322756826877594, + "step": 2579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.516, + "grad_norm": 3.9573092460632324, + "kl": 0.9219828229397535, + "learning_rate": 4.828673944226683e-07, + "loss": 0.0922, + "num_tokens": 22298793.0, + "reward": 0.75616455078125, + "reward_std": 0.00584527337923646, + "rewards//mean": 0.75616455078125, + "rewards//std": 0.03066955879330635, + "step": 2580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5162, + "grad_norm": 2.6585640907287598, + "kl": 1.039936289191246, + "learning_rate": 4.825502516487496e-07, + "loss": 0.104, + "num_tokens": 22307361.0, + "reward": 0.74395751953125, + "reward_std": 0.008076553232967854, + "rewards//mean": 0.74395751953125, + "rewards//std": 0.017146440222859383, + "step": 2581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5164, + "grad_norm": 1.7786988019943237, + "kl": 1.3825499173253775, + "learning_rate": 4.822331159035873e-07, + "loss": 0.1383, + "num_tokens": 22315985.0, + "reward": 0.7674560546875, + "reward_std": 0.009316005744040012, + "rewards//mean": 0.7674560546875, + "rewards//std": 0.023785943165421486, + "step": 2582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5166, + "grad_norm": 3.562347650527954, + "kl": 2.1416558995842934, + "learning_rate": 4.819159873149239e-07, + "loss": 0.2142, + "num_tokens": 22324657.0, + "reward": 0.7861328125, + "reward_std": 0.017463773488998413, + "rewards//mean": 0.7861328125, + "rewards//std": 0.02632840722799301, + "step": 2583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5168, + "grad_norm": 4.245169162750244, + "kl": 1.5955359302461147, + "learning_rate": 4.815988660104985e-07, + "loss": 0.1596, + "num_tokens": 22333161.0, + "reward": 0.7283935546875, + "reward_std": 0.009126987308263779, + "rewards//mean": 0.7283935546875, + "rewards//std": 0.03562483191490173, + "step": 2584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.517, + "grad_norm": 1.2504308223724365, + "kl": 0.6896695047616959, + "learning_rate": 4.812817521180478e-07, + "loss": 0.069, + "num_tokens": 22341785.0, + "reward": 0.7208251953125, + "reward_std": 0.002043080283328891, + "rewards//mean": 0.7208251953125, + "rewards//std": 0.03720775991678238, + "step": 2585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5172, + "grad_norm": 1.4241734743118286, + "kl": 1.3490499667823315, + "learning_rate": 4.809646457653051e-07, + "loss": 0.1349, + "num_tokens": 22350449.0, + "reward": 0.76885986328125, + "reward_std": 0.009102554060518742, + "rewards//mean": 0.76885986328125, + "rewards//std": 0.0233500637114048, + "step": 2586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5174, + "grad_norm": 6.4338178634643555, + "kl": 1.3450198527425528, + "learning_rate": 4.806475470800008e-07, + "loss": 0.1345, + "num_tokens": 22359097.0, + "reward": 0.76275634765625, + "reward_std": 0.0052174776792526245, + "rewards//mean": 0.76275634765625, + "rewards//std": 0.013191629201173782, + "step": 2587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5176, + "grad_norm": 6.421856880187988, + "kl": 2.106158208101988, + "learning_rate": 4.803304561898621e-07, + "loss": 0.2106, + "num_tokens": 22367817.0, + "reward": 0.75640869140625, + "reward_std": 0.011707558296620846, + "rewards//mean": 0.75640869140625, + "rewards//std": 0.04078759625554085, + "step": 2588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5178, + "grad_norm": 3.5428192615509033, + "kl": 1.3217595629394054, + "learning_rate": 4.800133732226135e-07, + "loss": 0.1322, + "num_tokens": 22376457.0, + "reward": 0.80047607421875, + "reward_std": 0.012906478717923164, + "rewards//mean": 0.80047607421875, + "rewards//std": 0.03589126840233803, + "step": 2589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.518, + "grad_norm": 3.2299907207489014, + "kl": 1.6418762244284153, + "learning_rate": 4.796962983059757e-07, + "loss": 0.1642, + "num_tokens": 22385097.0, + "reward": 0.7642822265625, + "reward_std": 0.012455468997359276, + "rewards//mean": 0.7642822265625, + "rewards//std": 0.03257230669260025, + "step": 2590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5182, + "grad_norm": 3.004642963409424, + "kl": 2.093676568940282, + "learning_rate": 4.793792315676664e-07, + "loss": 0.2094, + "num_tokens": 22393801.0, + "reward": 0.780029296875, + "reward_std": 0.015235761180520058, + "rewards//mean": 0.780029296875, + "rewards//std": 0.03712895140051842, + "step": 2591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5184, + "grad_norm": 1.9571582078933716, + "kl": 2.0521460622549057, + "learning_rate": 4.790621731354002e-07, + "loss": 0.2052, + "num_tokens": 22402425.0, + "reward": 0.76361083984375, + "reward_std": 0.018446065485477448, + "rewards//mean": 0.76361083984375, + "rewards//std": 0.03531772643327713, + "step": 2592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5186, + "grad_norm": 2.337049722671509, + "kl": 1.8338576219975948, + "learning_rate": 4.787451231368882e-07, + "loss": 0.1834, + "num_tokens": 22411017.0, + "reward": 0.76275634765625, + "reward_std": 0.01030757650732994, + "rewards//mean": 0.76275634765625, + "rewards//std": 0.036948252469301224, + "step": 2593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5188, + "grad_norm": 11.217828750610352, + "kl": 2.9629970006644726, + "learning_rate": 4.784280816998382e-07, + "loss": 0.2963, + "num_tokens": 22419641.0, + "reward": 0.735107421875, + "reward_std": 0.014385035261511803, + "rewards//mean": 0.735107421875, + "rewards//std": 0.039999157190322876, + "step": 2594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.519, + "grad_norm": 3.559687852859497, + "kl": 0.9134598076343536, + "learning_rate": 4.78111048951954e-07, + "loss": 0.0913, + "num_tokens": 22428273.0, + "reward": 0.7330322265625, + "reward_std": 0.004977358039468527, + "rewards//mean": 0.7330322265625, + "rewards//std": 0.026247501373291016, + "step": 2595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5192, + "grad_norm": 6.836112976074219, + "kl": 0.8729670122265816, + "learning_rate": 4.777940250209369e-07, + "loss": 0.0873, + "num_tokens": 22436889.0, + "reward": 0.779296875, + "reward_std": 0.007071391213685274, + "rewards//mean": 0.779296875, + "rewards//std": 0.0191492922604084, + "step": 2596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5194, + "grad_norm": 7.453264236450195, + "kl": 1.6207810938358307, + "learning_rate": 4.774770100344838e-07, + "loss": 0.1621, + "num_tokens": 22445473.0, + "reward": 0.74517822265625, + "reward_std": 0.008911101147532463, + "rewards//mean": 0.74517822265625, + "rewards//std": 0.033143747597932816, + "step": 2597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5196, + "grad_norm": 3.9046013355255127, + "kl": 1.5668298080563545, + "learning_rate": 4.771600041202883e-07, + "loss": 0.1567, + "num_tokens": 22454089.0, + "reward": 0.748291015625, + "reward_std": 0.013200517743825912, + "rewards//mean": 0.748291015625, + "rewards//std": 0.0323084257543087, + "step": 2598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5198, + "grad_norm": 3.394597291946411, + "kl": 1.434912158176303, + "learning_rate": 4.768430074060405e-07, + "loss": 0.1435, + "num_tokens": 22462721.0, + "reward": 0.74237060546875, + "reward_std": 0.006272242404520512, + "rewards//mean": 0.74237060546875, + "rewards//std": 0.03790082037448883, + "step": 2599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.52, + "grad_norm": 4.694181442260742, + "kl": 1.351081419736147, + "learning_rate": 4.7652602001942655e-07, + "loss": 0.1351, + "num_tokens": 22471305.0, + "reward": 0.77020263671875, + "reward_std": 0.012560486793518066, + "rewards//mean": 0.77020263671875, + "rewards//std": 0.030485397204756737, + "step": 2600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5202, + "grad_norm": 1.800676703453064, + "kl": 1.815363923087716, + "learning_rate": 4.762090420881288e-07, + "loss": 0.1815, + "num_tokens": 22479905.0, + "reward": 0.7115478515625, + "reward_std": 0.01158073078840971, + "rewards//mean": 0.7115478515625, + "rewards//std": 0.02465837635099888, + "step": 2601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5204, + "grad_norm": 11.646526336669922, + "kl": 2.6421290785074234, + "learning_rate": 4.758920737398263e-07, + "loss": 0.2642, + "num_tokens": 22488497.0, + "reward": 0.72314453125, + "reward_std": 0.018994122743606567, + "rewards//mean": 0.72314453125, + "rewards//std": 0.05012356489896774, + "step": 2602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5206, + "grad_norm": 3.5305421352386475, + "kl": 1.7523624170571566, + "learning_rate": 4.7557511510219335e-07, + "loss": 0.1752, + "num_tokens": 22497049.0, + "reward": 0.738525390625, + "reward_std": 0.0117521733045578, + "rewards//mean": 0.738525390625, + "rewards//std": 0.028158696368336678, + "step": 2603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5208, + "grad_norm": 16.25846290588379, + "kl": 1.7256456427276134, + "learning_rate": 4.7525816630290126e-07, + "loss": 0.1726, + "num_tokens": 22505785.0, + "reward": 0.74700927734375, + "reward_std": 0.012896544300019741, + "rewards//mean": 0.74700927734375, + "rewards//std": 0.03693842142820358, + "step": 2604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.521, + "grad_norm": 2.928736448287964, + "kl": 1.3061204347759485, + "learning_rate": 4.7494122746961687e-07, + "loss": 0.1306, + "num_tokens": 22514401.0, + "reward": 0.77484130859375, + "reward_std": 0.010342583991587162, + "rewards//mean": 0.77484130859375, + "rewards//std": 0.035932157188653946, + "step": 2605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5212, + "grad_norm": 21.40410804748535, + "kl": 3.867992267012596, + "learning_rate": 4.7462429873000293e-07, + "loss": 0.3868, + "num_tokens": 22523065.0, + "reward": 0.7352294921875, + "reward_std": 0.01248180028051138, + "rewards//mean": 0.7352294921875, + "rewards//std": 0.033112503588199615, + "step": 2606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5214, + "grad_norm": 3.086261510848999, + "kl": 1.610474530607462, + "learning_rate": 4.743073802117185e-07, + "loss": 0.161, + "num_tokens": 22531681.0, + "reward": 0.723388671875, + "reward_std": 0.007806425914168358, + "rewards//mean": 0.723388671875, + "rewards//std": 0.04156700149178505, + "step": 2607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5216, + "grad_norm": 2.9651544094085693, + "kl": 1.389178205281496, + "learning_rate": 4.7399047204241823e-07, + "loss": 0.1389, + "num_tokens": 22540329.0, + "reward": 0.7442626953125, + "reward_std": 0.006399606820195913, + "rewards//mean": 0.7442626953125, + "rewards//std": 0.0262636449187994, + "step": 2608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5218, + "grad_norm": 11.435442924499512, + "kl": 2.9151642210781574, + "learning_rate": 4.7367357434975274e-07, + "loss": 0.2915, + "num_tokens": 22549041.0, + "reward": 0.73162841796875, + "reward_std": 0.007305110804736614, + "rewards//mean": 0.73162841796875, + "rewards//std": 0.03398754447698593, + "step": 2609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.522, + "grad_norm": 2.70566463470459, + "kl": 1.5165428575128317, + "learning_rate": 4.733566872613682e-07, + "loss": 0.1517, + "num_tokens": 22557625.0, + "reward": 0.7864990234375, + "reward_std": 0.009786528535187244, + "rewards//mean": 0.7864990234375, + "rewards//std": 0.02867303602397442, + "step": 2610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5222, + "grad_norm": 2.747001886367798, + "kl": 0.7808938194066286, + "learning_rate": 4.7303981090490706e-07, + "loss": 0.0781, + "num_tokens": 22566193.0, + "reward": 0.72149658203125, + "reward_std": 0.003450260031968355, + "rewards//mean": 0.72149658203125, + "rewards//std": 0.02967313677072525, + "step": 2611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5224, + "grad_norm": 5.174720764160156, + "kl": 1.3340777661651373, + "learning_rate": 4.727229454080067e-07, + "loss": 0.1334, + "num_tokens": 22574833.0, + "reward": 0.77178955078125, + "reward_std": 0.004306231625378132, + "rewards//mean": 0.77178955078125, + "rewards//std": 0.028804391622543335, + "step": 2612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5226, + "grad_norm": 2.775101900100708, + "kl": 1.6023515835404396, + "learning_rate": 4.724060908983008e-07, + "loss": 0.1602, + "num_tokens": 22583401.0, + "reward": 0.759765625, + "reward_std": 0.009352784603834152, + "rewards//mean": 0.759765625, + "rewards//std": 0.02767392247915268, + "step": 2613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5228, + "grad_norm": 3.4158835411071777, + "kl": 1.3008998930454254, + "learning_rate": 4.7208924750341805e-07, + "loss": 0.1301, + "num_tokens": 22592009.0, + "reward": 0.73162841796875, + "reward_std": 0.011731077916920185, + "rewards//mean": 0.73162841796875, + "rewards//std": 0.03727741539478302, + "step": 2614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.523, + "grad_norm": 3.5716264247894287, + "kl": 0.7993995379656553, + "learning_rate": 4.717724153509832e-07, + "loss": 0.0799, + "num_tokens": 22600633.0, + "reward": 0.72589111328125, + "reward_std": 0.005505294539034367, + "rewards//mean": 0.72589111328125, + "rewards//std": 0.02954942174255848, + "step": 2615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5232, + "grad_norm": 9.18824291229248, + "kl": 1.3568630404770374, + "learning_rate": 4.7145559456861594e-07, + "loss": 0.1357, + "num_tokens": 22609249.0, + "reward": 0.766845703125, + "reward_std": 0.009566757827997208, + "rewards//mean": 0.766845703125, + "rewards//std": 0.02546682395040989, + "step": 2616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5234, + "grad_norm": 7.710018634796143, + "kl": 2.2801895029842854, + "learning_rate": 4.711387852839319e-07, + "loss": 0.228, + "num_tokens": 22617889.0, + "reward": 0.7828369140625, + "reward_std": 0.014238116331398487, + "rewards//mean": 0.7828369140625, + "rewards//std": 0.03574530407786369, + "step": 2617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5236, + "grad_norm": 2.9775192737579346, + "kl": 1.2666481956839561, + "learning_rate": 4.708219876245416e-07, + "loss": 0.1267, + "num_tokens": 22626401.0, + "reward": 0.759521484375, + "reward_std": 0.011624859645962715, + "rewards//mean": 0.759521484375, + "rewards//std": 0.036103684455156326, + "step": 2618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5238, + "grad_norm": 2.2733800411224365, + "kl": 1.5849352553486824, + "learning_rate": 4.7050520171805133e-07, + "loss": 0.1585, + "num_tokens": 22635025.0, + "reward": 0.75579833984375, + "reward_std": 0.009671460837125778, + "rewards//mean": 0.75579833984375, + "rewards//std": 0.03142769634723663, + "step": 2619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.524, + "grad_norm": 2.3432579040527344, + "kl": 1.054134426638484, + "learning_rate": 4.7018842769206214e-07, + "loss": 0.1054, + "num_tokens": 22643673.0, + "reward": 0.75958251953125, + "reward_std": 0.004661104176193476, + "rewards//mean": 0.75958251953125, + "rewards//std": 0.036636799573898315, + "step": 2620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5242, + "grad_norm": 4.114675521850586, + "kl": 1.5802659764885902, + "learning_rate": 4.698716656741708e-07, + "loss": 0.158, + "num_tokens": 22652305.0, + "reward": 0.7568359375, + "reward_std": 0.008275945670902729, + "rewards//mean": 0.7568359375, + "rewards//std": 0.02796124666929245, + "step": 2621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5244, + "grad_norm": 1.5225121974945068, + "kl": 0.599779887124896, + "learning_rate": 4.6955491579196893e-07, + "loss": 0.06, + "num_tokens": 22660873.0, + "reward": 0.794921875, + "reward_std": 0.0022851889953017235, + "rewards//mean": 0.794921875, + "rewards//std": 0.032881516963243484, + "step": 2622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5246, + "grad_norm": 3.964243173599243, + "kl": 1.3088293429464102, + "learning_rate": 4.692381781730432e-07, + "loss": 0.1309, + "num_tokens": 22669521.0, + "reward": 0.72979736328125, + "reward_std": 0.009758850559592247, + "rewards//mean": 0.72979736328125, + "rewards//std": 0.030667586252093315, + "step": 2623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5248, + "grad_norm": 6.153960704803467, + "kl": 1.8801350500434637, + "learning_rate": 4.6892145294497576e-07, + "loss": 0.188, + "num_tokens": 22678185.0, + "reward": 0.75714111328125, + "reward_std": 0.008084646426141262, + "rewards//mean": 0.75714111328125, + "rewards//std": 0.03122328594326973, + "step": 2624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.525, + "grad_norm": 0.6833559274673462, + "kl": 0.8053571712225676, + "learning_rate": 4.686047402353433e-07, + "loss": 0.0805, + "num_tokens": 22686825.0, + "reward": 0.78192138671875, + "reward_std": 0.004088305402547121, + "rewards//mean": 0.78192138671875, + "rewards//std": 0.029459122568368912, + "step": 2625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5252, + "grad_norm": 4.220037460327148, + "kl": 1.6383055709302425, + "learning_rate": 4.682880401717177e-07, + "loss": 0.1638, + "num_tokens": 22695449.0, + "reward": 0.7381591796875, + "reward_std": 0.01257932186126709, + "rewards//mean": 0.7381591796875, + "rewards//std": 0.034862007945775986, + "step": 2626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5254, + "grad_norm": 2.543783187866211, + "kl": 1.5210609138011932, + "learning_rate": 4.679713528816658e-07, + "loss": 0.1521, + "num_tokens": 22704025.0, + "reward": 0.73193359375, + "reward_std": 0.011085865087807178, + "rewards//mean": 0.73193359375, + "rewards//std": 0.03842170536518097, + "step": 2627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5256, + "grad_norm": 7.542661190032959, + "kl": 2.118085864931345, + "learning_rate": 4.676546784927491e-07, + "loss": 0.2118, + "num_tokens": 22712729.0, + "reward": 0.73614501953125, + "reward_std": 0.010074400343000889, + "rewards//mean": 0.73614501953125, + "rewards//std": 0.03970777615904808, + "step": 2628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5258, + "grad_norm": 2.5040130615234375, + "kl": 0.9424903355538845, + "learning_rate": 4.67338017132524e-07, + "loss": 0.0942, + "num_tokens": 22721369.0, + "reward": 0.7318115234375, + "reward_std": 0.007101117633283138, + "rewards//mean": 0.7318115234375, + "rewards//std": 0.03324936702847481, + "step": 2629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.526, + "grad_norm": 2.357954263687134, + "kl": 1.6269660405814648, + "learning_rate": 4.670213689285417e-07, + "loss": 0.1627, + "num_tokens": 22729969.0, + "reward": 0.7545166015625, + "reward_std": 0.00959410984069109, + "rewards//mean": 0.7545166015625, + "rewards//std": 0.03799520805478096, + "step": 2630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5262, + "grad_norm": 9.207202911376953, + "kl": 2.334194468334317, + "learning_rate": 4.66704734008348e-07, + "loss": 0.2334, + "num_tokens": 22738633.0, + "reward": 0.75128173828125, + "reward_std": 0.013249401934444904, + "rewards//mean": 0.75128173828125, + "rewards//std": 0.03492507338523865, + "step": 2631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5264, + "grad_norm": 4.387116432189941, + "kl": 1.2691538594663143, + "learning_rate": 4.6638811249948365e-07, + "loss": 0.1269, + "num_tokens": 22747257.0, + "reward": 0.74334716796875, + "reward_std": 0.004905918147414923, + "rewards//mean": 0.74334716796875, + "rewards//std": 0.02725662663578987, + "step": 2632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5266, + "grad_norm": 2.672348976135254, + "kl": 0.8763229176402092, + "learning_rate": 4.6607150452948336e-07, + "loss": 0.0876, + "num_tokens": 22755945.0, + "reward": 0.7586669921875, + "reward_std": 0.006133291870355606, + "rewards//mean": 0.7586669921875, + "rewards//std": 0.02395336702466011, + "step": 2633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5268, + "grad_norm": 1.53461754322052, + "kl": 1.339928386732936, + "learning_rate": 4.657549102258771e-07, + "loss": 0.134, + "num_tokens": 22764585.0, + "reward": 0.78314208984375, + "reward_std": 0.010806762613356113, + "rewards//mean": 0.78314208984375, + "rewards//std": 0.027410034090280533, + "step": 2634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.527, + "grad_norm": 5.34895133972168, + "kl": 1.7136068418622017, + "learning_rate": 4.6543832971618885e-07, + "loss": 0.1714, + "num_tokens": 22773217.0, + "reward": 0.7406005859375, + "reward_std": 0.008986150845885277, + "rewards//mean": 0.7406005859375, + "rewards//std": 0.03380203619599342, + "step": 2635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5272, + "grad_norm": 12.848258972167969, + "kl": 2.252772703766823, + "learning_rate": 4.6512176312793735e-07, + "loss": 0.2253, + "num_tokens": 22781945.0, + "reward": 0.72198486328125, + "reward_std": 0.015048781409859657, + "rewards//mean": 0.72198486328125, + "rewards//std": 0.0467381589114666, + "step": 2636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5274, + "grad_norm": 1.8560667037963867, + "kl": 1.426280502229929, + "learning_rate": 4.648052105886354e-07, + "loss": 0.1426, + "num_tokens": 22790601.0, + "reward": 0.72357177734375, + "reward_std": 0.0067107887007296085, + "rewards//mean": 0.72357177734375, + "rewards//std": 0.030247613787651062, + "step": 2637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5276, + "grad_norm": 7.187541961669922, + "kl": 2.127643883228302, + "learning_rate": 4.644886722257904e-07, + "loss": 0.2128, + "num_tokens": 22799161.0, + "reward": 0.73382568359375, + "reward_std": 0.009539835155010223, + "rewards//mean": 0.73382568359375, + "rewards//std": 0.04334123060107231, + "step": 2638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5278, + "grad_norm": 13.125093460083008, + "kl": 2.5189852826297283, + "learning_rate": 4.641721481669041e-07, + "loss": 0.2519, + "num_tokens": 22807809.0, + "reward": 0.75714111328125, + "reward_std": 0.010086900554597378, + "rewards//mean": 0.75714111328125, + "rewards//std": 0.03255993127822876, + "step": 2639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.528, + "grad_norm": 3.9278106689453125, + "kl": 1.8151211068034172, + "learning_rate": 4.638556385394721e-07, + "loss": 0.1815, + "num_tokens": 22816577.0, + "reward": 0.77117919921875, + "reward_std": 0.015506814233958721, + "rewards//mean": 0.77117919921875, + "rewards//std": 0.029641490429639816, + "step": 2640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5282, + "grad_norm": 6.739706516265869, + "kl": 2.6841716319322586, + "learning_rate": 4.6353914347098467e-07, + "loss": 0.2684, + "num_tokens": 22825233.0, + "reward": 0.77288818359375, + "reward_std": 0.012281034141778946, + "rewards//mean": 0.77288818359375, + "rewards//std": 0.022476011887192726, + "step": 2641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5284, + "grad_norm": 2.2411112785339355, + "kl": 1.5739174075424671, + "learning_rate": 4.6322266308892577e-07, + "loss": 0.1574, + "num_tokens": 22833889.0, + "reward": 0.76177978515625, + "reward_std": 0.012923799455165863, + "rewards//mean": 0.76177978515625, + "rewards//std": 0.03706115856766701, + "step": 2642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5286, + "grad_norm": 2.849498987197876, + "kl": 1.9340341314673424, + "learning_rate": 4.6290619752077394e-07, + "loss": 0.1934, + "num_tokens": 22842521.0, + "reward": 0.77630615234375, + "reward_std": 0.010408131405711174, + "rewards//mean": 0.77630615234375, + "rewards//std": 0.026948876678943634, + "step": 2643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5288, + "grad_norm": 1.8592721223831177, + "kl": 0.9634610544890165, + "learning_rate": 4.6258974689400113e-07, + "loss": 0.0963, + "num_tokens": 22851201.0, + "reward": 0.73114013671875, + "reward_std": 0.0016744968015700579, + "rewards//mean": 0.73114013671875, + "rewards//std": 0.02912840060889721, + "step": 2644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.529, + "grad_norm": 2.7396230697631836, + "kl": 0.9827704522758722, + "learning_rate": 4.6227331133607394e-07, + "loss": 0.0983, + "num_tokens": 22859737.0, + "reward": 0.7640380859375, + "reward_std": 0.01120184175670147, + "rewards//mean": 0.7640380859375, + "rewards//std": 0.03607579320669174, + "step": 2645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5292, + "grad_norm": 7.750306129455566, + "kl": 1.9698520340025425, + "learning_rate": 4.6195689097445236e-07, + "loss": 0.197, + "num_tokens": 22868297.0, + "reward": 0.7430419921875, + "reward_std": 0.013106929138302803, + "rewards//mean": 0.7430419921875, + "rewards//std": 0.02906941995024681, + "step": 2646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5294, + "grad_norm": 5.4731340408325195, + "kl": 1.6173828318715096, + "learning_rate": 4.6164048593659065e-07, + "loss": 0.1617, + "num_tokens": 22876945.0, + "reward": 0.7125244140625, + "reward_std": 0.009482982568442822, + "rewards//mean": 0.7125244140625, + "rewards//std": 0.0366816520690918, + "step": 2647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5296, + "grad_norm": 4.8560075759887695, + "kl": 1.7981848828494549, + "learning_rate": 4.6132409634993645e-07, + "loss": 0.1798, + "num_tokens": 22885617.0, + "reward": 0.75762939453125, + "reward_std": 0.01627742126584053, + "rewards//mean": 0.75762939453125, + "rewards//std": 0.03539607673883438, + "step": 2648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5298, + "grad_norm": 10.460521697998047, + "kl": 1.8477340806275606, + "learning_rate": 4.610077223419318e-07, + "loss": 0.1848, + "num_tokens": 22894225.0, + "reward": 0.73931884765625, + "reward_std": 0.009584047831594944, + "rewards//mean": 0.73931884765625, + "rewards//std": 0.026550481095910072, + "step": 2649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.53, + "grad_norm": 7.293250560760498, + "kl": 2.46062034368515, + "learning_rate": 4.606913640400117e-07, + "loss": 0.2461, + "num_tokens": 22903033.0, + "reward": 0.75750732421875, + "reward_std": 0.01208202913403511, + "rewards//mean": 0.75750732421875, + "rewards//std": 0.038902852684259415, + "step": 2650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5302, + "grad_norm": 2.6933183670043945, + "kl": 1.1261982880532742, + "learning_rate": 4.6037502157160567e-07, + "loss": 0.1126, + "num_tokens": 22911681.0, + "reward": 0.78125, + "reward_std": 0.006688940338790417, + "rewards//mean": 0.78125, + "rewards//std": 0.025094378739595413, + "step": 2651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5304, + "grad_norm": 5.122379302978516, + "kl": 1.0481851138174534, + "learning_rate": 4.6005869506413615e-07, + "loss": 0.1048, + "num_tokens": 22920401.0, + "reward": 0.76702880859375, + "reward_std": 0.008705508895218372, + "rewards//mean": 0.76702880859375, + "rewards//std": 0.02645109035074711, + "step": 2652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5306, + "grad_norm": 2.105739116668701, + "kl": 1.1116899531334639, + "learning_rate": 4.5974238464501954e-07, + "loss": 0.1112, + "num_tokens": 22928977.0, + "reward": 0.7816162109375, + "reward_std": 0.008741901256144047, + "rewards//mean": 0.7816162109375, + "rewards//std": 0.0289964247494936, + "step": 2653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5308, + "grad_norm": 4.543096542358398, + "kl": 0.8436103109270334, + "learning_rate": 4.594260904416655e-07, + "loss": 0.0844, + "num_tokens": 22937593.0, + "reward": 0.79400634765625, + "reward_std": 0.0078127421438694, + "rewards//mean": 0.79400634765625, + "rewards//std": 0.02452070824801922, + "step": 2654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.531, + "grad_norm": 4.440617561340332, + "kl": 1.7474451959133148, + "learning_rate": 4.591098125814776e-07, + "loss": 0.1747, + "num_tokens": 22946297.0, + "reward": 0.78424072265625, + "reward_std": 0.00798825453966856, + "rewards//mean": 0.78424072265625, + "rewards//std": 0.0337161086499691, + "step": 2655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5312, + "grad_norm": 3.4377551078796387, + "kl": 0.9144775699824095, + "learning_rate": 4.58793551191852e-07, + "loss": 0.0914, + "num_tokens": 22954905.0, + "reward": 0.75762939453125, + "reward_std": 0.00745496666058898, + "rewards//mean": 0.75762939453125, + "rewards//std": 0.027049237862229347, + "step": 2656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5314, + "grad_norm": 2.3190839290618896, + "kl": 1.4730648174881935, + "learning_rate": 4.584773064001792e-07, + "loss": 0.1473, + "num_tokens": 22963561.0, + "reward": 0.7723388671875, + "reward_std": 0.011345259845256805, + "rewards//mean": 0.7723388671875, + "rewards//std": 0.024264797568321228, + "step": 2657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5316, + "grad_norm": 4.331188201904297, + "kl": 1.7248932719230652, + "learning_rate": 4.5816107833384233e-07, + "loss": 0.1725, + "num_tokens": 22972249.0, + "reward": 0.7626953125, + "reward_std": 0.01030290499329567, + "rewards//mean": 0.7626953125, + "rewards//std": 0.04229552671313286, + "step": 2658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5318, + "grad_norm": 5.53788423538208, + "kl": 1.7243175022304058, + "learning_rate": 4.5784486712021817e-07, + "loss": 0.1724, + "num_tokens": 22980897.0, + "reward": 0.744140625, + "reward_std": 0.007420975714921951, + "rewards//mean": 0.744140625, + "rewards//std": 0.029700225219130516, + "step": 2659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.532, + "grad_norm": 4.1407790184021, + "kl": 1.503475233912468, + "learning_rate": 4.575286728866764e-07, + "loss": 0.1503, + "num_tokens": 22989537.0, + "reward": 0.77044677734375, + "reward_std": 0.011224744841456413, + "rewards//mean": 0.77044677734375, + "rewards//std": 0.025861937552690506, + "step": 2660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5322, + "grad_norm": 1.9548370838165283, + "kl": 1.2167020663619041, + "learning_rate": 4.5721249576058027e-07, + "loss": 0.1217, + "num_tokens": 22998113.0, + "reward": 0.7508544921875, + "reward_std": 0.00693829171359539, + "rewards//mean": 0.7508544921875, + "rewards//std": 0.029769206419587135, + "step": 2661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5324, + "grad_norm": 6.217153549194336, + "kl": 2.601410161703825, + "learning_rate": 4.568963358692856e-07, + "loss": 0.2601, + "num_tokens": 23006897.0, + "reward": 0.72076416015625, + "reward_std": 0.013574357144534588, + "rewards//mean": 0.72076416015625, + "rewards//std": 0.03701211139559746, + "step": 2662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5326, + "grad_norm": 3.614361047744751, + "kl": 0.8756587207317352, + "learning_rate": 4.565801933401417e-07, + "loss": 0.0876, + "num_tokens": 23015529.0, + "reward": 0.77001953125, + "reward_std": 0.004779680632054806, + "rewards//mean": 0.77001953125, + "rewards//std": 0.02426510863006115, + "step": 2663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5328, + "grad_norm": 3.770618438720703, + "kl": 1.2618157379329205, + "learning_rate": 4.562640683004907e-07, + "loss": 0.1262, + "num_tokens": 23024185.0, + "reward": 0.76129150390625, + "reward_std": 0.0064302366226911545, + "rewards//mean": 0.76129150390625, + "rewards//std": 0.02267182245850563, + "step": 2664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.533, + "grad_norm": 6.808322906494141, + "kl": 1.8657021410763264, + "learning_rate": 4.5594796087766787e-07, + "loss": 0.1866, + "num_tokens": 23032865.0, + "reward": 0.73724365234375, + "reward_std": 0.010692494921386242, + "rewards//mean": 0.73724365234375, + "rewards//std": 0.03692202270030975, + "step": 2665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5332, + "grad_norm": 2.354804039001465, + "kl": 1.425453096628189, + "learning_rate": 4.55631871199001e-07, + "loss": 0.1425, + "num_tokens": 23041585.0, + "reward": 0.76904296875, + "reward_std": 0.009244291111826897, + "rewards//mean": 0.76904296875, + "rewards//std": 0.028835445642471313, + "step": 2666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5334, + "grad_norm": 5.746089935302734, + "kl": 2.4229010492563248, + "learning_rate": 4.553157993918112e-07, + "loss": 0.2423, + "num_tokens": 23050337.0, + "reward": 0.76318359375, + "reward_std": 0.01093886699527502, + "rewards//mean": 0.76318359375, + "rewards//std": 0.03588583692908287, + "step": 2667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5336, + "grad_norm": 0.7911232709884644, + "kl": 0.6595764961093664, + "learning_rate": 4.5499974558341206e-07, + "loss": 0.066, + "num_tokens": 23058961.0, + "reward": 0.77996826171875, + "reward_std": 0.003970570396631956, + "rewards//mean": 0.77996826171875, + "rewards//std": 0.02069399505853653, + "step": 2668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5338, + "grad_norm": 1.313047170639038, + "kl": 0.5919544007629156, + "learning_rate": 4.5468370990110997e-07, + "loss": 0.0592, + "num_tokens": 23067521.0, + "reward": 0.75335693359375, + "reward_std": 0.003562675788998604, + "rewards//mean": 0.75335693359375, + "rewards//std": 0.02064492553472519, + "step": 2669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.534, + "grad_norm": 2.981106996536255, + "kl": 1.0191421695053577, + "learning_rate": 4.543676924722042e-07, + "loss": 0.1019, + "num_tokens": 23076249.0, + "reward": 0.76123046875, + "reward_std": 0.0099929915741086, + "rewards//mean": 0.76123046875, + "rewards//std": 0.022814344614744186, + "step": 2670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5342, + "grad_norm": 2.1800875663757324, + "kl": 0.7210257966071367, + "learning_rate": 4.540516934239863e-07, + "loss": 0.0721, + "num_tokens": 23084785.0, + "reward": 0.7523193359375, + "reward_std": 0.006631495431065559, + "rewards//mean": 0.7523193359375, + "rewards//std": 0.03262060508131981, + "step": 2671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5344, + "grad_norm": 3.2396016120910645, + "kl": 1.0751785542815924, + "learning_rate": 4.5373571288374097e-07, + "loss": 0.1075, + "num_tokens": 23093425.0, + "reward": 0.72998046875, + "reward_std": 0.004825330805033445, + "rewards//mean": 0.72998046875, + "rewards//std": 0.029260683804750443, + "step": 2672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5346, + "grad_norm": 1.026073694229126, + "kl": 1.1247605420649052, + "learning_rate": 4.534197509787448e-07, + "loss": 0.1125, + "num_tokens": 23102073.0, + "reward": 0.7698974609375, + "reward_std": 0.00820164568722248, + "rewards//mean": 0.7698974609375, + "rewards//std": 0.030794980004429817, + "step": 2673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5348, + "grad_norm": 6.963526725769043, + "kl": 2.4781657494604588, + "learning_rate": 4.5310380783626747e-07, + "loss": 0.2478, + "num_tokens": 23110833.0, + "reward": 0.69793701171875, + "reward_std": 0.014291295781731606, + "rewards//mean": 0.69793701171875, + "rewards//std": 0.04735337570309639, + "step": 2674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.535, + "grad_norm": 2.6351962089538574, + "kl": 1.824907023459673, + "learning_rate": 4.527878835835706e-07, + "loss": 0.1825, + "num_tokens": 23119449.0, + "reward": 0.744140625, + "reward_std": 0.008462807163596153, + "rewards//mean": 0.744140625, + "rewards//std": 0.03580813854932785, + "step": 2675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5352, + "grad_norm": 7.313758850097656, + "kl": 2.2280320301651955, + "learning_rate": 4.5247197834790873e-07, + "loss": 0.2228, + "num_tokens": 23128065.0, + "reward": 0.77630615234375, + "reward_std": 0.011128314770758152, + "rewards//mean": 0.77630615234375, + "rewards//std": 0.023758163675665855, + "step": 2676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5354, + "grad_norm": 2.602595567703247, + "kl": 0.9010493084788322, + "learning_rate": 4.5215609225652817e-07, + "loss": 0.0901, + "num_tokens": 23136625.0, + "reward": 0.76556396484375, + "reward_std": 0.006668279878795147, + "rewards//mean": 0.76556396484375, + "rewards//std": 0.026470541954040527, + "step": 2677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5356, + "grad_norm": 3.615633010864258, + "kl": 0.6364397667348385, + "learning_rate": 4.5184022543666806e-07, + "loss": 0.0636, + "num_tokens": 23145273.0, + "reward": 0.76177978515625, + "reward_std": 0.005356741603463888, + "rewards//mean": 0.76177978515625, + "rewards//std": 0.028008779510855675, + "step": 2678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5358, + "grad_norm": 1.659595251083374, + "kl": 1.332785863429308, + "learning_rate": 4.5152437801555926e-07, + "loss": 0.1333, + "num_tokens": 23153873.0, + "reward": 0.74969482421875, + "reward_std": 0.006395323202013969, + "rewards//mean": 0.74969482421875, + "rewards//std": 0.025565823540091515, + "step": 2679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.536, + "grad_norm": 3.4136128425598145, + "kl": 1.971382049843669, + "learning_rate": 4.512085501204253e-07, + "loss": 0.1971, + "num_tokens": 23162593.0, + "reward": 0.76483154296875, + "reward_std": 0.018675558269023895, + "rewards//mean": 0.76483154296875, + "rewards//std": 0.03603186458349228, + "step": 2680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5362, + "grad_norm": 3.2165894508361816, + "kl": 1.6427891366183758, + "learning_rate": 4.508927418784814e-07, + "loss": 0.1643, + "num_tokens": 23171273.0, + "reward": 0.7579345703125, + "reward_std": 0.010650633834302425, + "rewards//mean": 0.7579345703125, + "rewards//std": 0.026106400415301323, + "step": 2681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5364, + "grad_norm": 3.5290915966033936, + "kl": 0.9826488550752401, + "learning_rate": 4.5057695341693536e-07, + "loss": 0.0983, + "num_tokens": 23180009.0, + "reward": 0.7449951171875, + "reward_std": 0.0032529656309634447, + "rewards//mean": 0.7449951171875, + "rewards//std": 0.03166934847831726, + "step": 2682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5366, + "grad_norm": 1.5096626281738281, + "kl": 1.437236299738288, + "learning_rate": 4.502611848629865e-07, + "loss": 0.1437, + "num_tokens": 23188777.0, + "reward": 0.76531982421875, + "reward_std": 0.00798747967928648, + "rewards//mean": 0.76531982421875, + "rewards//std": 0.034183841198682785, + "step": 2683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5368, + "grad_norm": 2.5073795318603516, + "kl": 1.5426080748438835, + "learning_rate": 4.499454363438264e-07, + "loss": 0.1543, + "num_tokens": 23197465.0, + "reward": 0.79486083984375, + "reward_std": 0.012999268248677254, + "rewards//mean": 0.79486083984375, + "rewards//std": 0.03199290856719017, + "step": 2684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.537, + "grad_norm": 3.9387192726135254, + "kl": 2.607405947521329, + "learning_rate": 4.496297079866386e-07, + "loss": 0.2607, + "num_tokens": 23206081.0, + "reward": 0.7633056640625, + "reward_std": 0.01925293169915676, + "rewards//mean": 0.7633056640625, + "rewards//std": 0.037101831287145615, + "step": 2685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5372, + "grad_norm": 2.0134117603302, + "kl": 1.4562037494033575, + "learning_rate": 4.4931399991859833e-07, + "loss": 0.1456, + "num_tokens": 23214673.0, + "reward": 0.74029541015625, + "reward_std": 0.007854134775698185, + "rewards//mean": 0.74029541015625, + "rewards//std": 0.022509662434458733, + "step": 2686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5374, + "grad_norm": 2.599942922592163, + "kl": 1.3621915020048618, + "learning_rate": 4.489983122668729e-07, + "loss": 0.1362, + "num_tokens": 23223257.0, + "reward": 0.7415771484375, + "reward_std": 0.007936595007777214, + "rewards//mean": 0.7415771484375, + "rewards//std": 0.025460582226514816, + "step": 2687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5376, + "grad_norm": 1.3719426393508911, + "kl": 0.975915901362896, + "learning_rate": 4.486826451586211e-07, + "loss": 0.0976, + "num_tokens": 23231849.0, + "reward": 0.7584228515625, + "reward_std": 0.00714164087548852, + "rewards//mean": 0.7584228515625, + "rewards//std": 0.03209288418292999, + "step": 2688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5378, + "grad_norm": 2.3303065299987793, + "kl": 1.1816036067903042, + "learning_rate": 4.483669987209938e-07, + "loss": 0.1182, + "num_tokens": 23240457.0, + "reward": 0.76312255859375, + "reward_std": 0.00967983715236187, + "rewards//mean": 0.76312255859375, + "rewards//std": 0.03221452236175537, + "step": 2689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.538, + "grad_norm": 2.550154447555542, + "kl": 1.0380831230431795, + "learning_rate": 4.4805137308113315e-07, + "loss": 0.1038, + "num_tokens": 23249129.0, + "reward": 0.784423828125, + "reward_std": 0.004327394533902407, + "rewards//mean": 0.784423828125, + "rewards//std": 0.018699748441576958, + "step": 2690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5382, + "grad_norm": 1.688706874847412, + "kl": 1.7687797043472528, + "learning_rate": 4.477357683661733e-07, + "loss": 0.1769, + "num_tokens": 23257857.0, + "reward": 0.762451171875, + "reward_std": 0.011334855109453201, + "rewards//mean": 0.762451171875, + "rewards//std": 0.03827248886227608, + "step": 2691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5384, + "grad_norm": 4.354102611541748, + "kl": 0.709210928529501, + "learning_rate": 4.474201847032396e-07, + "loss": 0.0709, + "num_tokens": 23266465.0, + "reward": 0.7530517578125, + "reward_std": 0.0034194316249340773, + "rewards//mean": 0.7530517578125, + "rewards//std": 0.03253696858882904, + "step": 2692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5386, + "grad_norm": 3.246397018432617, + "kl": 0.6598598770797253, + "learning_rate": 4.4710462221944936e-07, + "loss": 0.066, + "num_tokens": 23275113.0, + "reward": 0.77587890625, + "reward_std": 0.001830365275964141, + "rewards//mean": 0.77587890625, + "rewards//std": 0.029285505414009094, + "step": 2693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5388, + "grad_norm": 2.9002163410186768, + "kl": 1.3007855266332626, + "learning_rate": 4.4678908104191076e-07, + "loss": 0.1301, + "num_tokens": 23283785.0, + "reward": 0.75634765625, + "reward_std": 0.006654093973338604, + "rewards//mean": 0.75634765625, + "rewards//std": 0.027599429711699486, + "step": 2694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.539, + "grad_norm": 1.4071025848388672, + "kl": 1.2403850760310888, + "learning_rate": 4.464735612977242e-07, + "loss": 0.124, + "num_tokens": 23292401.0, + "reward": 0.77960205078125, + "reward_std": 0.006884078029543161, + "rewards//mean": 0.77960205078125, + "rewards//std": 0.020854303613305092, + "step": 2695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5392, + "grad_norm": 4.144754886627197, + "kl": 1.8681033495813608, + "learning_rate": 4.4615806311398055e-07, + "loss": 0.1868, + "num_tokens": 23301025.0, + "reward": 0.76177978515625, + "reward_std": 0.011058426462113857, + "rewards//mean": 0.76177978515625, + "rewards//std": 0.024651238694787025, + "step": 2696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5394, + "grad_norm": 4.118155479431152, + "kl": 0.9806359969079494, + "learning_rate": 4.458425866177627e-07, + "loss": 0.0981, + "num_tokens": 23309657.0, + "reward": 0.75872802734375, + "reward_std": 0.004982184153050184, + "rewards//mean": 0.75872802734375, + "rewards//std": 0.0183713398873806, + "step": 2697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5396, + "grad_norm": 2.053034782409668, + "kl": 0.9947618339210749, + "learning_rate": 4.4552713193614443e-07, + "loss": 0.0995, + "num_tokens": 23318321.0, + "reward": 0.7777099609375, + "reward_std": 0.00503022875636816, + "rewards//mean": 0.7777099609375, + "rewards//std": 0.02327904850244522, + "step": 2698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5398, + "grad_norm": 4.671300411224365, + "kl": 1.2800943665206432, + "learning_rate": 4.45211699196191e-07, + "loss": 0.128, + "num_tokens": 23326905.0, + "reward": 0.7608642578125, + "reward_std": 0.018226834014058113, + "rewards//mean": 0.7608642578125, + "rewards//std": 0.03440393880009651, + "step": 2699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.54, + "grad_norm": 3.0310757160186768, + "kl": 1.1680125258862972, + "learning_rate": 4.448962885249586e-07, + "loss": 0.1168, + "num_tokens": 23335497.0, + "reward": 0.75567626953125, + "reward_std": 0.00779640581458807, + "rewards//mean": 0.75567626953125, + "rewards//std": 0.02917305752635002, + "step": 2700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5402, + "grad_norm": 9.918130874633789, + "kl": 1.4426944460719824, + "learning_rate": 4.445809000494945e-07, + "loss": 0.1443, + "num_tokens": 23344201.0, + "reward": 0.79046630859375, + "reward_std": 0.006647826172411442, + "rewards//mean": 0.79046630859375, + "rewards//std": 0.019603874534368515, + "step": 2701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5404, + "grad_norm": 6.583772659301758, + "kl": 2.454082516953349, + "learning_rate": 4.442655338968373e-07, + "loss": 0.2454, + "num_tokens": 23352873.0, + "reward": 0.7977294921875, + "reward_std": 0.017155012115836143, + "rewards//mean": 0.7977294921875, + "rewards//std": 0.0335485078394413, + "step": 2702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5406, + "grad_norm": 4.40888786315918, + "kl": 1.8387727569788694, + "learning_rate": 4.439501901940163e-07, + "loss": 0.1839, + "num_tokens": 23361601.0, + "reward": 0.7467041015625, + "reward_std": 0.008022364228963852, + "rewards//mean": 0.7467041015625, + "rewards//std": 0.022758208215236664, + "step": 2703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5408, + "grad_norm": 7.131270408630371, + "kl": 1.7035969235002995, + "learning_rate": 4.436348690680521e-07, + "loss": 0.1704, + "num_tokens": 23370257.0, + "reward": 0.76123046875, + "reward_std": 0.008089488372206688, + "rewards//mean": 0.76123046875, + "rewards//std": 0.024543000385165215, + "step": 2704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.541, + "grad_norm": 1.6261810064315796, + "kl": 0.8756543416529894, + "learning_rate": 4.4331957064595575e-07, + "loss": 0.0876, + "num_tokens": 23378881.0, + "reward": 0.7393798828125, + "reward_std": 0.006033864803612232, + "rewards//mean": 0.7393798828125, + "rewards//std": 0.0251278355717659, + "step": 2705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5412, + "grad_norm": 3.6621451377868652, + "kl": 1.5106176864355803, + "learning_rate": 4.430042950547297e-07, + "loss": 0.1511, + "num_tokens": 23387497.0, + "reward": 0.7606201171875, + "reward_std": 0.011936716735363007, + "rewards//mean": 0.7606201171875, + "rewards//std": 0.02714790217578411, + "step": 2706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5414, + "grad_norm": 9.66750717163086, + "kl": 2.7674637977033854, + "learning_rate": 4.4268904242136667e-07, + "loss": 0.2767, + "num_tokens": 23396129.0, + "reward": 0.74932861328125, + "reward_std": 0.00897124968469143, + "rewards//mean": 0.74932861328125, + "rewards//std": 0.02578514628112316, + "step": 2707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5416, + "grad_norm": 1.1957478523254395, + "kl": 1.4280615337193012, + "learning_rate": 4.4237381287285064e-07, + "loss": 0.1428, + "num_tokens": 23404777.0, + "reward": 0.75604248046875, + "reward_std": 0.00974909495562315, + "rewards//mean": 0.75604248046875, + "rewards//std": 0.019929347559809685, + "step": 2708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5418, + "grad_norm": 3.3371119499206543, + "kl": 1.003537155687809, + "learning_rate": 4.420586065361557e-07, + "loss": 0.1004, + "num_tokens": 23413473.0, + "reward": 0.76434326171875, + "reward_std": 0.007077973335981369, + "rewards//mean": 0.76434326171875, + "rewards//std": 0.030671535059809685, + "step": 2709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.542, + "grad_norm": 1.8586697578430176, + "kl": 1.2356816120445728, + "learning_rate": 4.4174342353824736e-07, + "loss": 0.1236, + "num_tokens": 23422177.0, + "reward": 0.76422119140625, + "reward_std": 0.005549092311412096, + "rewards//mean": 0.76422119140625, + "rewards//std": 0.028385648503899574, + "step": 2710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5422, + "grad_norm": 2.9504032135009766, + "kl": 2.604219987988472, + "learning_rate": 4.4142826400608085e-07, + "loss": 0.2604, + "num_tokens": 23430809.0, + "reward": 0.7301025390625, + "reward_std": 0.014678099192678928, + "rewards//mean": 0.7301025390625, + "rewards//std": 0.031494878232479095, + "step": 2711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5424, + "grad_norm": 1.698063850402832, + "kl": 1.301852973178029, + "learning_rate": 4.411131280666027e-07, + "loss": 0.1302, + "num_tokens": 23439433.0, + "reward": 0.7471923828125, + "reward_std": 0.0098424656316638, + "rewards//mean": 0.7471923828125, + "rewards//std": 0.03998458385467529, + "step": 2712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5426, + "grad_norm": 2.6074020862579346, + "kl": 1.374724930152297, + "learning_rate": 4.407980158467495e-07, + "loss": 0.1375, + "num_tokens": 23448017.0, + "reward": 0.697021484375, + "reward_std": 0.005433548241853714, + "rewards//mean": 0.697021484375, + "rewards//std": 0.035535700619220734, + "step": 2713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5428, + "grad_norm": 7.269412517547607, + "kl": 1.3516736701130867, + "learning_rate": 4.4048292747344844e-07, + "loss": 0.1352, + "num_tokens": 23456657.0, + "reward": 0.771240234375, + "reward_std": 0.010234792716801167, + "rewards//mean": 0.771240234375, + "rewards//std": 0.027302753180265427, + "step": 2714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.543, + "grad_norm": 2.7812976837158203, + "kl": 1.4249127954244614, + "learning_rate": 4.4016786307361715e-07, + "loss": 0.1425, + "num_tokens": 23465313.0, + "reward": 0.75628662109375, + "reward_std": 0.008593153208494186, + "rewards//mean": 0.75628662109375, + "rewards//std": 0.029344309121370316, + "step": 2715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5432, + "grad_norm": 3.0851025581359863, + "kl": 1.3093355186283588, + "learning_rate": 4.398528227741633e-07, + "loss": 0.1309, + "num_tokens": 23473961.0, + "reward": 0.77862548828125, + "reward_std": 0.010351684875786304, + "rewards//mean": 0.77862548828125, + "rewards//std": 0.02718878537416458, + "step": 2716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5434, + "grad_norm": 3.4834866523742676, + "kl": 1.3279999196529388, + "learning_rate": 4.3953780670198534e-07, + "loss": 0.1328, + "num_tokens": 23482585.0, + "reward": 0.78076171875, + "reward_std": 0.010690304450690746, + "rewards//mean": 0.78076171875, + "rewards//std": 0.03149319440126419, + "step": 2717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5436, + "grad_norm": 2.6697263717651367, + "kl": 0.9720768127590418, + "learning_rate": 4.392228149839716e-07, + "loss": 0.0972, + "num_tokens": 23491225.0, + "reward": 0.757568359375, + "reward_std": 0.008404724299907684, + "rewards//mean": 0.757568359375, + "rewards//std": 0.026592710986733437, + "step": 2718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5438, + "grad_norm": 3.582355499267578, + "kl": 1.093993017449975, + "learning_rate": 4.389078477470007e-07, + "loss": 0.1094, + "num_tokens": 23499857.0, + "reward": 0.7861328125, + "reward_std": 0.009061952121555805, + "rewards//mean": 0.7861328125, + "rewards//std": 0.035625044256448746, + "step": 2719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.544, + "grad_norm": 3.856337070465088, + "kl": 1.435177929699421, + "learning_rate": 4.385929051179414e-07, + "loss": 0.1435, + "num_tokens": 23508457.0, + "reward": 0.75872802734375, + "reward_std": 0.006233691703528166, + "rewards//mean": 0.75872802734375, + "rewards//std": 0.03365859389305115, + "step": 2720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5442, + "grad_norm": 1.8299291133880615, + "kl": 1.3630895018577576, + "learning_rate": 4.382779872236526e-07, + "loss": 0.1363, + "num_tokens": 23517137.0, + "reward": 0.76715087890625, + "reward_std": 0.008637620136141777, + "rewards//mean": 0.76715087890625, + "rewards//std": 0.03640925511717796, + "step": 2721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5444, + "grad_norm": 3.9812753200531006, + "kl": 1.6540025491267443, + "learning_rate": 4.3796309419098315e-07, + "loss": 0.1654, + "num_tokens": 23525849.0, + "reward": 0.7716064453125, + "reward_std": 0.007957818917930126, + "rewards//mean": 0.7716064453125, + "rewards//std": 0.030568024143576622, + "step": 2722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5446, + "grad_norm": 6.02662467956543, + "kl": 1.7502205781638622, + "learning_rate": 4.37648226146772e-07, + "loss": 0.175, + "num_tokens": 23534481.0, + "reward": 0.7540283203125, + "reward_std": 0.010076741687953472, + "rewards//mean": 0.7540283203125, + "rewards//std": 0.036551013588905334, + "step": 2723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5448, + "grad_norm": 2.8278489112854004, + "kl": 2.0870148707181215, + "learning_rate": 4.3733338321784777e-07, + "loss": 0.2087, + "num_tokens": 23543049.0, + "reward": 0.72650146484375, + "reward_std": 0.010426721535623074, + "rewards//mean": 0.72650146484375, + "rewards//std": 0.03966238349676132, + "step": 2724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.545, + "grad_norm": 10.2551851272583, + "kl": 1.139538599178195, + "learning_rate": 4.3701856553102943e-07, + "loss": 0.114, + "num_tokens": 23551721.0, + "reward": 0.76171875, + "reward_std": 0.011692647822201252, + "rewards//mean": 0.76171875, + "rewards//std": 0.030480990186333656, + "step": 2725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5452, + "grad_norm": 3.6158030033111572, + "kl": 1.4823046028614044, + "learning_rate": 4.367037732131253e-07, + "loss": 0.1482, + "num_tokens": 23560313.0, + "reward": 0.76495361328125, + "reward_std": 0.007985929027199745, + "rewards//mean": 0.76495361328125, + "rewards//std": 0.02158200368285179, + "step": 2726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5454, + "grad_norm": 5.481783390045166, + "kl": 1.5257047284394503, + "learning_rate": 4.363890063909338e-07, + "loss": 0.1526, + "num_tokens": 23569009.0, + "reward": 0.7205810546875, + "reward_std": 0.008970895782113075, + "rewards//mean": 0.7205810546875, + "rewards//std": 0.037917040288448334, + "step": 2727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5456, + "grad_norm": 5.616946220397949, + "kl": 2.0938849095255136, + "learning_rate": 4.360742651912428e-07, + "loss": 0.2094, + "num_tokens": 23577753.0, + "reward": 0.76446533203125, + "reward_std": 0.014405623078346252, + "rewards//mean": 0.76446533203125, + "rewards//std": 0.02611299604177475, + "step": 2728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5458, + "grad_norm": 7.884521484375, + "kl": 1.5158877614885569, + "learning_rate": 4.357595497408303e-07, + "loss": 0.1516, + "num_tokens": 23586305.0, + "reward": 0.75994873046875, + "reward_std": 0.005395525135099888, + "rewards//mean": 0.75994873046875, + "rewards//std": 0.03952436521649361, + "step": 2729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.546, + "grad_norm": 5.061817169189453, + "kl": 1.5064756069332361, + "learning_rate": 4.354448601664633e-07, + "loss": 0.1506, + "num_tokens": 23594905.0, + "reward": 0.77630615234375, + "reward_std": 0.01162352692335844, + "rewards//mean": 0.77630615234375, + "rewards//std": 0.028571689501404762, + "step": 2730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5462, + "grad_norm": 14.948163032531738, + "kl": 2.856360137462616, + "learning_rate": 4.3513019659489906e-07, + "loss": 0.2856, + "num_tokens": 23603553.0, + "reward": 0.728759765625, + "reward_std": 0.01317685842514038, + "rewards//mean": 0.728759765625, + "rewards//std": 0.03824083134531975, + "step": 2731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5464, + "grad_norm": 2.8378334045410156, + "kl": 1.2864750884473324, + "learning_rate": 4.3481555915288384e-07, + "loss": 0.1286, + "num_tokens": 23612209.0, + "reward": 0.7855224609375, + "reward_std": 0.010509679093956947, + "rewards//mean": 0.7855224609375, + "rewards//std": 0.03323844075202942, + "step": 2732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5466, + "grad_norm": 6.774303913116455, + "kl": 2.6692296229302883, + "learning_rate": 4.345009479671535e-07, + "loss": 0.2669, + "num_tokens": 23620993.0, + "reward": 0.76153564453125, + "reward_std": 0.016014590859413147, + "rewards//mean": 0.76153564453125, + "rewards//std": 0.041500259190797806, + "step": 2733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5468, + "grad_norm": 5.780099868774414, + "kl": 1.337006539106369, + "learning_rate": 4.3418636316443365e-07, + "loss": 0.1337, + "num_tokens": 23629529.0, + "reward": 0.745361328125, + "reward_std": 0.005393799860030413, + "rewards//mean": 0.745361328125, + "rewards//std": 0.031741201877593994, + "step": 2734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.547, + "grad_norm": 5.696723461151123, + "kl": 1.7169923409819603, + "learning_rate": 4.338718048714387e-07, + "loss": 0.1717, + "num_tokens": 23638081.0, + "reward": 0.77117919921875, + "reward_std": 0.007427798584103584, + "rewards//mean": 0.77117919921875, + "rewards//std": 0.023269537836313248, + "step": 2735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5472, + "grad_norm": 3.815852165222168, + "kl": 2.182823557406664, + "learning_rate": 4.3355727321487297e-07, + "loss": 0.2183, + "num_tokens": 23646713.0, + "reward": 0.7689208984375, + "reward_std": 0.016301535069942474, + "rewards//mean": 0.7689208984375, + "rewards//std": 0.03707897290587425, + "step": 2736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5474, + "grad_norm": 3.664146900177002, + "kl": 1.1942524537444115, + "learning_rate": 4.332427683214295e-07, + "loss": 0.1194, + "num_tokens": 23655401.0, + "reward": 0.77069091796875, + "reward_std": 0.009241471067070961, + "rewards//mean": 0.77069091796875, + "rewards//std": 0.02414683625102043, + "step": 2737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5476, + "grad_norm": 8.04285717010498, + "kl": 2.009334960952401, + "learning_rate": 4.329282903177911e-07, + "loss": 0.2009, + "num_tokens": 23664065.0, + "reward": 0.72760009765625, + "reward_std": 0.010237504728138447, + "rewards//mean": 0.72760009765625, + "rewards//std": 0.03668510913848877, + "step": 2738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5478, + "grad_norm": 9.715472221374512, + "kl": 2.1735199224203825, + "learning_rate": 4.3261383933062916e-07, + "loss": 0.2174, + "num_tokens": 23672705.0, + "reward": 0.7611083984375, + "reward_std": 0.009888289496302605, + "rewards//mean": 0.7611083984375, + "rewards//std": 0.035796087235212326, + "step": 2739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.548, + "grad_norm": 3.495805025100708, + "kl": 1.800416436046362, + "learning_rate": 4.32299415486605e-07, + "loss": 0.18, + "num_tokens": 23681377.0, + "reward": 0.73565673828125, + "reward_std": 0.010325726121664047, + "rewards//mean": 0.73565673828125, + "rewards//std": 0.0332481749355793, + "step": 2740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5482, + "grad_norm": 5.147223949432373, + "kl": 1.485960878431797, + "learning_rate": 4.31985018912368e-07, + "loss": 0.1486, + "num_tokens": 23690009.0, + "reward": 0.74395751953125, + "reward_std": 0.012874758802354336, + "rewards//mean": 0.74395751953125, + "rewards//std": 0.02955249510705471, + "step": 2741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5484, + "grad_norm": 3.8885111808776855, + "kl": 1.3051352314651012, + "learning_rate": 4.316706497345572e-07, + "loss": 0.1305, + "num_tokens": 23698585.0, + "reward": 0.78887939453125, + "reward_std": 0.010362228378653526, + "rewards//mean": 0.78887939453125, + "rewards//std": 0.03163843974471092, + "step": 2742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5486, + "grad_norm": 2.123316526412964, + "kl": 1.8132994584739208, + "learning_rate": 4.313563080798006e-07, + "loss": 0.1813, + "num_tokens": 23707249.0, + "reward": 0.75726318359375, + "reward_std": 0.013624865561723709, + "rewards//mean": 0.75726318359375, + "rewards//std": 0.03156515210866928, + "step": 2743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5488, + "grad_norm": 5.784191608428955, + "kl": 3.087730724364519, + "learning_rate": 4.3104199407471477e-07, + "loss": 0.3088, + "num_tokens": 23715921.0, + "reward": 0.75811767578125, + "reward_std": 0.02032296359539032, + "rewards//mean": 0.75811767578125, + "rewards//std": 0.0348365418612957, + "step": 2744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.549, + "grad_norm": 6.5163984298706055, + "kl": 1.503587869927287, + "learning_rate": 4.3072770784590564e-07, + "loss": 0.1504, + "num_tokens": 23724521.0, + "reward": 0.76507568359375, + "reward_std": 0.007474065758287907, + "rewards//mean": 0.76507568359375, + "rewards//std": 0.03602303937077522, + "step": 2745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5492, + "grad_norm": 1.6777963638305664, + "kl": 1.2435228023678064, + "learning_rate": 4.304134495199674e-07, + "loss": 0.1244, + "num_tokens": 23733137.0, + "reward": 0.73626708984375, + "reward_std": 0.006538215558975935, + "rewards//mean": 0.73626708984375, + "rewards//std": 0.021620547398924828, + "step": 2746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5494, + "grad_norm": 3.4300222396850586, + "kl": 1.7247562818229198, + "learning_rate": 4.3009921922348334e-07, + "loss": 0.1725, + "num_tokens": 23741745.0, + "reward": 0.74176025390625, + "reward_std": 0.013384273275732994, + "rewards//mean": 0.74176025390625, + "rewards//std": 0.031674306839704514, + "step": 2747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5496, + "grad_norm": 4.080512046813965, + "kl": 1.5515307039022446, + "learning_rate": 4.297850170830255e-07, + "loss": 0.1552, + "num_tokens": 23750481.0, + "reward": 0.76507568359375, + "reward_std": 0.014782292768359184, + "rewards//mean": 0.76507568359375, + "rewards//std": 0.03456126153469086, + "step": 2748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5498, + "grad_norm": 4.5242438316345215, + "kl": 1.3936299681663513, + "learning_rate": 4.294708432251543e-07, + "loss": 0.1394, + "num_tokens": 23759081.0, + "reward": 0.75, + "reward_std": 0.006259226240217686, + "rewards//mean": 0.75, + "rewards//std": 0.027917902916669846, + "step": 2749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.55, + "grad_norm": 1.2146681547164917, + "kl": 0.7616769857704639, + "learning_rate": 4.291566977764191e-07, + "loss": 0.0762, + "num_tokens": 23767641.0, + "reward": 0.764892578125, + "reward_std": 0.004266746342182159, + "rewards//mean": 0.764892578125, + "rewards//std": 0.024218900129199028, + "step": 2750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5502, + "grad_norm": 5.934704780578613, + "kl": 1.4693366009742022, + "learning_rate": 4.2884258086335745e-07, + "loss": 0.1469, + "num_tokens": 23776425.0, + "reward": 0.76123046875, + "reward_std": 0.00622732425108552, + "rewards//mean": 0.76123046875, + "rewards//std": 0.028429411351680756, + "step": 2751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5504, + "grad_norm": 2.2671287059783936, + "kl": 1.1624149791896343, + "learning_rate": 4.285284926124959e-07, + "loss": 0.1162, + "num_tokens": 23784961.0, + "reward": 0.77392578125, + "reward_std": 0.0045250169932842255, + "rewards//mean": 0.77392578125, + "rewards//std": 0.025737622752785683, + "step": 2752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5506, + "grad_norm": 4.421723365783691, + "kl": 1.2495483700186014, + "learning_rate": 4.2821443315034875e-07, + "loss": 0.125, + "num_tokens": 23793609.0, + "reward": 0.75213623046875, + "reward_std": 0.00964227132499218, + "rewards//mean": 0.75213623046875, + "rewards//std": 0.033457860350608826, + "step": 2753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5508, + "grad_norm": 3.201054573059082, + "kl": 1.1702329851686954, + "learning_rate": 4.2790040260341954e-07, + "loss": 0.117, + "num_tokens": 23802225.0, + "reward": 0.719482421875, + "reward_std": 0.004991226363927126, + "rewards//mean": 0.719482421875, + "rewards//std": 0.027977483347058296, + "step": 2754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.551, + "grad_norm": 3.423197031021118, + "kl": 0.8825385440140963, + "learning_rate": 4.2758640109819944e-07, + "loss": 0.0883, + "num_tokens": 23810761.0, + "reward": 0.80303955078125, + "reward_std": 0.007841301150619984, + "rewards//mean": 0.80303955078125, + "rewards//std": 0.01981814019382, + "step": 2755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5512, + "grad_norm": 6.832088947296143, + "kl": 1.8967163544148207, + "learning_rate": 4.272724287611684e-07, + "loss": 0.1897, + "num_tokens": 23819353.0, + "reward": 0.72650146484375, + "reward_std": 0.007752531208097935, + "rewards//mean": 0.72650146484375, + "rewards//std": 0.03763829916715622, + "step": 2756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5514, + "grad_norm": 1.9150985479354858, + "kl": 1.271044285967946, + "learning_rate": 4.2695848571879424e-07, + "loss": 0.1271, + "num_tokens": 23828025.0, + "reward": 0.74005126953125, + "reward_std": 0.009517215192317963, + "rewards//mean": 0.74005126953125, + "rewards//std": 0.023632310330867767, + "step": 2757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5516, + "grad_norm": 6.5145416259765625, + "kl": 1.097481407225132, + "learning_rate": 4.2664457209753333e-07, + "loss": 0.1097, + "num_tokens": 23836841.0, + "reward": 0.7120361328125, + "reward_std": 0.01239514909684658, + "rewards//mean": 0.7120361328125, + "rewards//std": 0.03301727771759033, + "step": 2758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5518, + "grad_norm": 2.21148943901062, + "kl": 0.7628206219524145, + "learning_rate": 4.2633068802383004e-07, + "loss": 0.0763, + "num_tokens": 23845545.0, + "reward": 0.7481689453125, + "reward_std": 0.005524271633476019, + "rewards//mean": 0.7481689453125, + "rewards//std": 0.027813328430056572, + "step": 2759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.552, + "grad_norm": 1.7247365713119507, + "kl": 1.4232266768813133, + "learning_rate": 4.2601683362411685e-07, + "loss": 0.1423, + "num_tokens": 23854145.0, + "reward": 0.77838134765625, + "reward_std": 0.012917826883494854, + "rewards//mean": 0.77838134765625, + "rewards//std": 0.034055620431900024, + "step": 2760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5522, + "grad_norm": 2.0331857204437256, + "kl": 1.1604901067912579, + "learning_rate": 4.257030090248142e-07, + "loss": 0.116, + "num_tokens": 23862809.0, + "reward": 0.775390625, + "reward_std": 0.006807130295783281, + "rewards//mean": 0.775390625, + "rewards//std": 0.02055242285132408, + "step": 2761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5524, + "grad_norm": 4.687293529510498, + "kl": 2.276164021342993, + "learning_rate": 4.2538921435233053e-07, + "loss": 0.2276, + "num_tokens": 23871513.0, + "reward": 0.71893310546875, + "reward_std": 0.01136801764369011, + "rewards//mean": 0.71893310546875, + "rewards//std": 0.04244089871644974, + "step": 2762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5526, + "grad_norm": 8.995779037475586, + "kl": 1.1090340800583363, + "learning_rate": 4.2507544973306255e-07, + "loss": 0.1109, + "num_tokens": 23880121.0, + "reward": 0.78271484375, + "reward_std": 0.00763237290084362, + "rewards//mean": 0.78271484375, + "rewards//std": 0.022200971841812134, + "step": 2763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5528, + "grad_norm": 6.427226543426514, + "kl": 2.787599056959152, + "learning_rate": 4.2476171529339435e-07, + "loss": 0.2788, + "num_tokens": 23888889.0, + "reward": 0.739501953125, + "reward_std": 0.013825233094394207, + "rewards//mean": 0.739501953125, + "rewards//std": 0.036450859159231186, + "step": 2764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.553, + "grad_norm": 3.090238571166992, + "kl": 1.193793199956417, + "learning_rate": 4.244480111596983e-07, + "loss": 0.1194, + "num_tokens": 23897585.0, + "reward": 0.77752685546875, + "reward_std": 0.009557783603668213, + "rewards//mean": 0.77752685546875, + "rewards//std": 0.029874471947550774, + "step": 2765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5532, + "grad_norm": 5.262044429779053, + "kl": 1.8497429881244898, + "learning_rate": 4.241343374583342e-07, + "loss": 0.185, + "num_tokens": 23906249.0, + "reward": 0.7880859375, + "reward_std": 0.008182472549378872, + "rewards//mean": 0.7880859375, + "rewards//std": 0.02864583395421505, + "step": 2766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5534, + "grad_norm": 4.4632978439331055, + "kl": 1.8824023231863976, + "learning_rate": 4.2382069431565e-07, + "loss": 0.1882, + "num_tokens": 23914825.0, + "reward": 0.7542724609375, + "reward_std": 0.00942098256200552, + "rewards//mean": 0.7542724609375, + "rewards//std": 0.035411734133958817, + "step": 2767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5536, + "grad_norm": 2.836287021636963, + "kl": 1.4697507545351982, + "learning_rate": 4.23507081857981e-07, + "loss": 0.147, + "num_tokens": 23923457.0, + "reward": 0.7679443359375, + "reward_std": 0.012394722551107407, + "rewards//mean": 0.7679443359375, + "rewards//std": 0.02768895961344242, + "step": 2768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5538, + "grad_norm": 6.46589994430542, + "kl": 2.4142470490187407, + "learning_rate": 4.2319350021165036e-07, + "loss": 0.2414, + "num_tokens": 23931985.0, + "reward": 0.7496337890625, + "reward_std": 0.014223143458366394, + "rewards//mean": 0.7496337890625, + "rewards//std": 0.022159680724143982, + "step": 2769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.554, + "grad_norm": 3.4517600536346436, + "kl": 1.5403327569365501, + "learning_rate": 4.2287994950296844e-07, + "loss": 0.154, + "num_tokens": 23940649.0, + "reward": 0.7445068359375, + "reward_std": 0.007558787241578102, + "rewards//mean": 0.7445068359375, + "rewards//std": 0.020725151523947716, + "step": 2770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5542, + "grad_norm": 5.8898749351501465, + "kl": 2.180416490882635, + "learning_rate": 4.2256642985823387e-07, + "loss": 0.218, + "num_tokens": 23949337.0, + "reward": 0.77337646484375, + "reward_std": 0.006437809206545353, + "rewards//mean": 0.77337646484375, + "rewards//std": 0.020225655287504196, + "step": 2771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5544, + "grad_norm": 6.370865821838379, + "kl": 2.0436097718775272, + "learning_rate": 4.222529414037319e-07, + "loss": 0.2044, + "num_tokens": 23957945.0, + "reward": 0.77178955078125, + "reward_std": 0.01296340674161911, + "rewards//mean": 0.77178955078125, + "rewards//std": 0.030574647709727287, + "step": 2772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5546, + "grad_norm": 16.922809600830078, + "kl": 2.2894393298774958, + "learning_rate": 4.21939484265736e-07, + "loss": 0.2289, + "num_tokens": 23966609.0, + "reward": 0.76287841796875, + "reward_std": 0.008009975776076317, + "rewards//mean": 0.76287841796875, + "rewards//std": 0.03232944384217262, + "step": 2773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5548, + "grad_norm": 3.4866135120391846, + "kl": 1.5605244915932417, + "learning_rate": 4.216260585705064e-07, + "loss": 0.1561, + "num_tokens": 23975337.0, + "reward": 0.75836181640625, + "reward_std": 0.010747031308710575, + "rewards//mean": 0.75836181640625, + "rewards//std": 0.033020202070474625, + "step": 2774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.555, + "grad_norm": 4.202916622161865, + "kl": 1.8807479944080114, + "learning_rate": 4.2131266444429105e-07, + "loss": 0.1881, + "num_tokens": 23984009.0, + "reward": 0.7633056640625, + "reward_std": 0.015597840771079063, + "rewards//mean": 0.7633056640625, + "rewards//std": 0.034356385469436646, + "step": 2775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5552, + "grad_norm": 1.7025935649871826, + "kl": 0.6696557179093361, + "learning_rate": 4.20999302013325e-07, + "loss": 0.067, + "num_tokens": 23992601.0, + "reward": 0.726318359375, + "reward_std": 0.002489755628630519, + "rewards//mean": 0.726318359375, + "rewards//std": 0.02759394235908985, + "step": 2776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5554, + "grad_norm": 5.02734899520874, + "kl": 1.4180385023355484, + "learning_rate": 4.206859714038308e-07, + "loss": 0.1418, + "num_tokens": 24001281.0, + "reward": 0.78277587890625, + "reward_std": 0.0077003901824355125, + "rewards//mean": 0.78277587890625, + "rewards//std": 0.026826143264770508, + "step": 2777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5556, + "grad_norm": 3.4935340881347656, + "kl": 1.333476446568966, + "learning_rate": 4.203726727420178e-07, + "loss": 0.1333, + "num_tokens": 24009905.0, + "reward": 0.77325439453125, + "reward_std": 0.0075424788519740105, + "rewards//mean": 0.77325439453125, + "rewards//std": 0.02262169122695923, + "step": 2778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5558, + "grad_norm": 3.974247932434082, + "kl": 1.6538397278636694, + "learning_rate": 4.200594061540826e-07, + "loss": 0.1654, + "num_tokens": 24018617.0, + "reward": 0.73565673828125, + "reward_std": 0.0073721082881093025, + "rewards//mean": 0.73565673828125, + "rewards//std": 0.03266019746661186, + "step": 2779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.556, + "grad_norm": 1.8046237230300903, + "kl": 0.6415868848562241, + "learning_rate": 4.1974617176620913e-07, + "loss": 0.0642, + "num_tokens": 24027273.0, + "reward": 0.76385498046875, + "reward_std": 0.00446641631424427, + "rewards//mean": 0.76385498046875, + "rewards//std": 0.017865875735878944, + "step": 2780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5562, + "grad_norm": 1.5222598314285278, + "kl": 1.4350633136928082, + "learning_rate": 4.19432969704568e-07, + "loss": 0.1435, + "num_tokens": 24035825.0, + "reward": 0.73199462890625, + "reward_std": 0.010021761991083622, + "rewards//mean": 0.73199462890625, + "rewards//std": 0.032020337879657745, + "step": 2781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5564, + "grad_norm": 3.9649484157562256, + "kl": 0.9754351768642664, + "learning_rate": 4.191198000953171e-07, + "loss": 0.0975, + "num_tokens": 24044497.0, + "reward": 0.7608642578125, + "reward_std": 0.00828640814870596, + "rewards//mean": 0.7608642578125, + "rewards//std": 0.02932657115161419, + "step": 2782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5566, + "grad_norm": 2.895944595336914, + "kl": 1.4642733316868544, + "learning_rate": 4.188066630646009e-07, + "loss": 0.1464, + "num_tokens": 24053153.0, + "reward": 0.78369140625, + "reward_std": 0.01062181405723095, + "rewards//mean": 0.78369140625, + "rewards//std": 0.0258315559476614, + "step": 2783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5568, + "grad_norm": 2.1017825603485107, + "kl": 1.2349791154265404, + "learning_rate": 4.184935587385513e-07, + "loss": 0.1235, + "num_tokens": 24061809.0, + "reward": 0.72265625, + "reward_std": 0.007621363736689091, + "rewards//mean": 0.72265625, + "rewards//std": 0.03456219285726547, + "step": 2784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.557, + "grad_norm": 8.931374549865723, + "kl": 2.0015293676406145, + "learning_rate": 4.1818048724328636e-07, + "loss": 0.2002, + "num_tokens": 24070425.0, + "reward": 0.766845703125, + "reward_std": 0.013596789911389351, + "rewards//mean": 0.766845703125, + "rewards//std": 0.03824716433882713, + "step": 2785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5572, + "grad_norm": 2.1834824085235596, + "kl": 1.1157512124627829, + "learning_rate": 4.1786744870491154e-07, + "loss": 0.1116, + "num_tokens": 24079097.0, + "reward": 0.7518310546875, + "reward_std": 0.009738964028656483, + "rewards//mean": 0.7518310546875, + "rewards//std": 0.03438633307814598, + "step": 2786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5574, + "grad_norm": 4.437238693237305, + "kl": 1.0247641801834106, + "learning_rate": 4.175544432495184e-07, + "loss": 0.1025, + "num_tokens": 24087729.0, + "reward": 0.77655029296875, + "reward_std": 0.008339803665876389, + "rewards//mean": 0.77655029296875, + "rewards//std": 0.025028465315699577, + "step": 2787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5576, + "grad_norm": 4.778684139251709, + "kl": 1.914508005604148, + "learning_rate": 4.1724147100318573e-07, + "loss": 0.1915, + "num_tokens": 24096361.0, + "reward": 0.76898193359375, + "reward_std": 0.013479089364409447, + "rewards//mean": 0.76898193359375, + "rewards//std": 0.04052846506237984, + "step": 2788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5578, + "grad_norm": 3.8494958877563477, + "kl": 1.5572038814425468, + "learning_rate": 4.169285320919786e-07, + "loss": 0.1557, + "num_tokens": 24104937.0, + "reward": 0.72491455078125, + "reward_std": 0.010252334177494049, + "rewards//mean": 0.72491455078125, + "rewards//std": 0.03938855230808258, + "step": 2789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.558, + "grad_norm": 3.860703945159912, + "kl": 1.4313068371266127, + "learning_rate": 4.166156266419489e-07, + "loss": 0.1431, + "num_tokens": 24113553.0, + "reward": 0.75933837890625, + "reward_std": 0.009340978227555752, + "rewards//mean": 0.75933837890625, + "rewards//std": 0.029762277379631996, + "step": 2790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5582, + "grad_norm": 4.098750114440918, + "kl": 1.4357947539538145, + "learning_rate": 4.1630275477913465e-07, + "loss": 0.1436, + "num_tokens": 24122097.0, + "reward": 0.7913818359375, + "reward_std": 0.013172097504138947, + "rewards//mean": 0.7913818359375, + "rewards//std": 0.03868066519498825, + "step": 2791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5584, + "grad_norm": 3.8933193683624268, + "kl": 0.6361732296645641, + "learning_rate": 4.1598991662956096e-07, + "loss": 0.0636, + "num_tokens": 24130673.0, + "reward": 0.76898193359375, + "reward_std": 0.004527666140347719, + "rewards//mean": 0.76898193359375, + "rewards//std": 0.02177542634308338, + "step": 2792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5586, + "grad_norm": 9.294544219970703, + "kl": 2.5912675485014915, + "learning_rate": 4.1567711231923876e-07, + "loss": 0.2591, + "num_tokens": 24139281.0, + "reward": 0.77349853515625, + "reward_std": 0.015198972076177597, + "rewards//mean": 0.77349853515625, + "rewards//std": 0.040776461362838745, + "step": 2793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5588, + "grad_norm": 3.0781607627868652, + "kl": 1.6668838188052177, + "learning_rate": 4.1536434197416556e-07, + "loss": 0.1667, + "num_tokens": 24147897.0, + "reward": 0.77252197265625, + "reward_std": 0.01639275997877121, + "rewards//mean": 0.77252197265625, + "rewards//std": 0.027592778205871582, + "step": 2794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.559, + "grad_norm": 4.619749546051025, + "kl": 2.0685927364975214, + "learning_rate": 4.1505160572032534e-07, + "loss": 0.2069, + "num_tokens": 24156497.0, + "reward": 0.7215576171875, + "reward_std": 0.014024253934621811, + "rewards//mean": 0.7215576171875, + "rewards//std": 0.03543053939938545, + "step": 2795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5592, + "grad_norm": 7.09420919418335, + "kl": 1.9501712955534458, + "learning_rate": 4.1473890368368805e-07, + "loss": 0.195, + "num_tokens": 24165153.0, + "reward": 0.741943359375, + "reward_std": 0.009855777956545353, + "rewards//mean": 0.741943359375, + "rewards//std": 0.030606873333454132, + "step": 2796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5594, + "grad_norm": 4.554149150848389, + "kl": 1.0774493645876646, + "learning_rate": 4.1442623599021035e-07, + "loss": 0.1077, + "num_tokens": 24173881.0, + "reward": 0.787353515625, + "reward_std": 0.008499134331941605, + "rewards//mean": 0.787353515625, + "rewards//std": 0.018425745889544487, + "step": 2797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5596, + "grad_norm": 6.377016544342041, + "kl": 2.04929381608963, + "learning_rate": 4.141136027658344e-07, + "loss": 0.2049, + "num_tokens": 24182577.0, + "reward": 0.72589111328125, + "reward_std": 0.011406106874346733, + "rewards//mean": 0.72589111328125, + "rewards//std": 0.03187582269310951, + "step": 2798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5598, + "grad_norm": 3.8820698261260986, + "kl": 1.017936972901225, + "learning_rate": 4.138010041364891e-07, + "loss": 0.1018, + "num_tokens": 24191169.0, + "reward": 0.756103515625, + "reward_std": 0.006095568649470806, + "rewards//mean": 0.756103515625, + "rewards//std": 0.02341553010046482, + "step": 2799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.56, + "grad_norm": 8.252959251403809, + "kl": 1.483777655288577, + "learning_rate": 4.134884402280889e-07, + "loss": 0.1484, + "num_tokens": 24199777.0, + "reward": 0.78045654296875, + "reward_std": 0.010169846005737782, + "rewards//mean": 0.78045654296875, + "rewards//std": 0.037319622933864594, + "step": 2800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5602, + "grad_norm": 2.658522844314575, + "kl": 1.7739341259002686, + "learning_rate": 4.131759111665348e-07, + "loss": 0.1774, + "num_tokens": 24208409.0, + "reward": 0.764404296875, + "reward_std": 0.01567518524825573, + "rewards//mean": 0.764404296875, + "rewards//std": 0.03387473151087761, + "step": 2801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5604, + "grad_norm": 6.474039554595947, + "kl": 0.7814014330506325, + "learning_rate": 4.128634170777132e-07, + "loss": 0.0781, + "num_tokens": 24217073.0, + "reward": 0.785400390625, + "reward_std": 0.0030888125766068697, + "rewards//mean": 0.785400390625, + "rewards//std": 0.012962575070559978, + "step": 2802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5606, + "grad_norm": 1.079495906829834, + "kl": 0.6904963739216328, + "learning_rate": 4.1255095808749687e-07, + "loss": 0.069, + "num_tokens": 24225657.0, + "reward": 0.75750732421875, + "reward_std": 0.0038325032219290733, + "rewards//mean": 0.75750732421875, + "rewards//std": 0.024342887103557587, + "step": 2803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5608, + "grad_norm": 7.227648735046387, + "kl": 1.799388937652111, + "learning_rate": 4.12238534321744e-07, + "loss": 0.1799, + "num_tokens": 24234409.0, + "reward": 0.75433349609375, + "reward_std": 0.006956029683351517, + "rewards//mean": 0.75433349609375, + "rewards//std": 0.027996886521577835, + "step": 2804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.561, + "grad_norm": 3.453542947769165, + "kl": 1.7165628597140312, + "learning_rate": 4.1192614590629916e-07, + "loss": 0.1717, + "num_tokens": 24243057.0, + "reward": 0.73968505859375, + "reward_std": 0.012546703219413757, + "rewards//mean": 0.73968505859375, + "rewards//std": 0.03828192874789238, + "step": 2805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5612, + "grad_norm": 5.403055667877197, + "kl": 1.1727212518453598, + "learning_rate": 4.1161379296699204e-07, + "loss": 0.1173, + "num_tokens": 24251689.0, + "reward": 0.74224853515625, + "reward_std": 0.009701358154416084, + "rewards//mean": 0.74224853515625, + "rewards//std": 0.03959515690803528, + "step": 2806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5614, + "grad_norm": 3.401139974594116, + "kl": 0.9832586701959372, + "learning_rate": 4.113014756296388e-07, + "loss": 0.0983, + "num_tokens": 24260289.0, + "reward": 0.74462890625, + "reward_std": 0.007674938999116421, + "rewards//mean": 0.74462890625, + "rewards//std": 0.020045317709445953, + "step": 2807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5616, + "grad_norm": 3.884908437728882, + "kl": 0.8709667362272739, + "learning_rate": 4.1098919402004037e-07, + "loss": 0.0871, + "num_tokens": 24268897.0, + "reward": 0.73114013671875, + "reward_std": 0.008018376305699348, + "rewards//mean": 0.73114013671875, + "rewards//std": 0.03120000660419464, + "step": 2808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5618, + "grad_norm": 3.8774807453155518, + "kl": 1.762805551290512, + "learning_rate": 4.1067694826398403e-07, + "loss": 0.1763, + "num_tokens": 24277489.0, + "reward": 0.76251220703125, + "reward_std": 0.010848302394151688, + "rewards//mean": 0.76251220703125, + "rewards//std": 0.02823861502110958, + "step": 2809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.562, + "grad_norm": 1.7476869821548462, + "kl": 1.3065430168062449, + "learning_rate": 4.1036473848724227e-07, + "loss": 0.1307, + "num_tokens": 24286169.0, + "reward": 0.73992919921875, + "reward_std": 0.008076776750385761, + "rewards//mean": 0.73992919921875, + "rewards//std": 0.0273287296295166, + "step": 2810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5622, + "grad_norm": 5.053516864776611, + "kl": 1.5737548377364874, + "learning_rate": 4.1005256481557306e-07, + "loss": 0.1574, + "num_tokens": 24294849.0, + "reward": 0.75518798828125, + "reward_std": 0.00939989648759365, + "rewards//mean": 0.75518798828125, + "rewards//std": 0.02415059693157673, + "step": 2811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5624, + "grad_norm": 8.408778190612793, + "kl": 0.8559587094932795, + "learning_rate": 4.0974042737472005e-07, + "loss": 0.0856, + "num_tokens": 24303537.0, + "reward": 0.75982666015625, + "reward_std": 0.002607045928016305, + "rewards//mean": 0.75982666015625, + "rewards//std": 0.015593846328556538, + "step": 2812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5626, + "grad_norm": 2.443895101547241, + "kl": 0.9473502356559038, + "learning_rate": 4.0942832629041197e-07, + "loss": 0.0947, + "num_tokens": 24312097.0, + "reward": 0.76556396484375, + "reward_std": 0.006246202625334263, + "rewards//mean": 0.76556396484375, + "rewards//std": 0.028386715799570084, + "step": 2813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5628, + "grad_norm": 3.7463016510009766, + "kl": 1.749101446941495, + "learning_rate": 4.0911626168836334e-07, + "loss": 0.1749, + "num_tokens": 24320825.0, + "reward": 0.74114990234375, + "reward_std": 0.010974368080496788, + "rewards//mean": 0.74114990234375, + "rewards//std": 0.039032988250255585, + "step": 2814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.563, + "grad_norm": 1.3205562829971313, + "kl": 1.0421797707676888, + "learning_rate": 4.0880423369427353e-07, + "loss": 0.1042, + "num_tokens": 24329505.0, + "reward": 0.78289794921875, + "reward_std": 0.007120010443031788, + "rewards//mean": 0.78289794921875, + "rewards//std": 0.026151226833462715, + "step": 2815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5632, + "grad_norm": 2.8591294288635254, + "kl": 1.3801924027502537, + "learning_rate": 4.084922424338276e-07, + "loss": 0.138, + "num_tokens": 24338065.0, + "reward": 0.75091552734375, + "reward_std": 0.011316141113638878, + "rewards//mean": 0.75091552734375, + "rewards//std": 0.03359376639127731, + "step": 2816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5634, + "grad_norm": 1.1285207271575928, + "kl": 0.5847410652786493, + "learning_rate": 4.0818028803269545e-07, + "loss": 0.0585, + "num_tokens": 24346769.0, + "reward": 0.77447509765625, + "reward_std": 0.0025895023718476295, + "rewards//mean": 0.77447509765625, + "rewards//std": 0.0297021996229887, + "step": 2817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5636, + "grad_norm": 2.443070650100708, + "kl": 0.7588466126471758, + "learning_rate": 4.078683706165323e-07, + "loss": 0.0759, + "num_tokens": 24355441.0, + "reward": 0.7156982421875, + "reward_std": 0.006113075651228428, + "rewards//mean": 0.7156982421875, + "rewards//std": 0.03685128688812256, + "step": 2818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5638, + "grad_norm": 2.839108467102051, + "kl": 1.3912135288119316, + "learning_rate": 4.075564903109784e-07, + "loss": 0.1391, + "num_tokens": 24364321.0, + "reward": 0.7515869140625, + "reward_std": 0.009785149246454239, + "rewards//mean": 0.7515869140625, + "rewards//std": 0.043707676231861115, + "step": 2819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.564, + "grad_norm": 3.3456649780273438, + "kl": 1.2809899915009737, + "learning_rate": 4.072446472416592e-07, + "loss": 0.1281, + "num_tokens": 24372953.0, + "reward": 0.76434326171875, + "reward_std": 0.00723646255210042, + "rewards//mean": 0.76434326171875, + "rewards//std": 0.02247803285717964, + "step": 2820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5642, + "grad_norm": 3.5987629890441895, + "kl": 0.9464195091277361, + "learning_rate": 4.0693284153418497e-07, + "loss": 0.0946, + "num_tokens": 24381633.0, + "reward": 0.77734375, + "reward_std": 0.005104473326355219, + "rewards//mean": 0.77734375, + "rewards//std": 0.019512640312314034, + "step": 2821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5644, + "grad_norm": 2.3896987438201904, + "kl": 1.1559174861758947, + "learning_rate": 4.0662107331415107e-07, + "loss": 0.1156, + "num_tokens": 24390217.0, + "reward": 0.736328125, + "reward_std": 0.010888107120990753, + "rewards//mean": 0.736328125, + "rewards//std": 0.029732827097177505, + "step": 2822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5646, + "grad_norm": 2.9707303047180176, + "kl": 1.1775026246905327, + "learning_rate": 4.0630934270713755e-07, + "loss": 0.1178, + "num_tokens": 24398825.0, + "reward": 0.7340087890625, + "reward_std": 0.007471631281077862, + "rewards//mean": 0.7340087890625, + "rewards//std": 0.022409658879041672, + "step": 2823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5648, + "grad_norm": 2.0847198963165283, + "kl": 1.3811526708304882, + "learning_rate": 4.0599764983870974e-07, + "loss": 0.1381, + "num_tokens": 24407393.0, + "reward": 0.75537109375, + "reward_std": 0.009396232664585114, + "rewards//mean": 0.75537109375, + "rewards//std": 0.02411492168903351, + "step": 2824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.565, + "grad_norm": 3.7436866760253906, + "kl": 1.600894445553422, + "learning_rate": 4.0568599483441745e-07, + "loss": 0.1601, + "num_tokens": 24416049.0, + "reward": 0.76202392578125, + "reward_std": 0.014135611243546009, + "rewards//mean": 0.76202392578125, + "rewards//std": 0.030562762171030045, + "step": 2825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5652, + "grad_norm": 1.5653101205825806, + "kl": 1.2630480732768774, + "learning_rate": 4.0537437781979505e-07, + "loss": 0.1263, + "num_tokens": 24424681.0, + "reward": 0.75469970703125, + "reward_std": 0.006195409689098597, + "rewards//mean": 0.75469970703125, + "rewards//std": 0.023521877825260162, + "step": 2826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5654, + "grad_norm": 1.5347360372543335, + "kl": 1.6917649414390326, + "learning_rate": 4.0506279892036185e-07, + "loss": 0.1692, + "num_tokens": 24433297.0, + "reward": 0.73626708984375, + "reward_std": 0.009380525909364223, + "rewards//mean": 0.73626708984375, + "rewards//std": 0.03690069913864136, + "step": 2827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5656, + "grad_norm": 2.1606030464172363, + "kl": 1.2863441314548254, + "learning_rate": 4.0475125826162193e-07, + "loss": 0.1286, + "num_tokens": 24441945.0, + "reward": 0.76531982421875, + "reward_std": 0.005815101321786642, + "rewards//mean": 0.76531982421875, + "rewards//std": 0.023055173456668854, + "step": 2828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5658, + "grad_norm": 3.7162234783172607, + "kl": 1.1622996758669615, + "learning_rate": 4.0443975596906376e-07, + "loss": 0.1162, + "num_tokens": 24450609.0, + "reward": 0.7628173828125, + "reward_std": 0.011923268437385559, + "rewards//mean": 0.7628173828125, + "rewards//std": 0.03326575458049774, + "step": 2829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.566, + "grad_norm": 1.3449400663375854, + "kl": 0.911068506538868, + "learning_rate": 4.041282921681605e-07, + "loss": 0.0911, + "num_tokens": 24459257.0, + "reward": 0.7657470703125, + "reward_std": 0.005179694388061762, + "rewards//mean": 0.7657470703125, + "rewards//std": 0.0270092636346817, + "step": 2830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5662, + "grad_norm": 4.31550931930542, + "kl": 1.8549344819039106, + "learning_rate": 4.038168669843697e-07, + "loss": 0.1855, + "num_tokens": 24467857.0, + "reward": 0.751953125, + "reward_std": 0.0134589783847332, + "rewards//mean": 0.751953125, + "rewards//std": 0.027357034385204315, + "step": 2831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5664, + "grad_norm": 4.9253740310668945, + "kl": 1.804510459303856, + "learning_rate": 4.0350548054313336e-07, + "loss": 0.1805, + "num_tokens": 24476505.0, + "reward": 0.7681884765625, + "reward_std": 0.01267233956605196, + "rewards//mean": 0.7681884765625, + "rewards//std": 0.02763204462826252, + "step": 2832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5666, + "grad_norm": 3.8080976009368896, + "kl": 2.193727772682905, + "learning_rate": 4.031941329698778e-07, + "loss": 0.2194, + "num_tokens": 24485129.0, + "reward": 0.7828369140625, + "reward_std": 0.022050779312849045, + "rewards//mean": 0.7828369140625, + "rewards//std": 0.03590081259608269, + "step": 2833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5668, + "grad_norm": 1.5609924793243408, + "kl": 1.025518273934722, + "learning_rate": 4.028828243900141e-07, + "loss": 0.1026, + "num_tokens": 24493857.0, + "reward": 0.7818603515625, + "reward_std": 0.004524230025708675, + "rewards//mean": 0.7818603515625, + "rewards//std": 0.02469763532280922, + "step": 2834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.567, + "grad_norm": 2.7725160121917725, + "kl": 0.8214268423616886, + "learning_rate": 4.02571554928937e-07, + "loss": 0.0821, + "num_tokens": 24502465.0, + "reward": 0.75665283203125, + "reward_std": 0.004812084138393402, + "rewards//mean": 0.75665283203125, + "rewards//std": 0.03713420033454895, + "step": 2835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5672, + "grad_norm": 3.326720952987671, + "kl": 2.08671597763896, + "learning_rate": 4.0226032471202597e-07, + "loss": 0.2087, + "num_tokens": 24511161.0, + "reward": 0.7703857421875, + "reward_std": 0.012206031009554863, + "rewards//mean": 0.7703857421875, + "rewards//std": 0.024914879351854324, + "step": 2836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5674, + "grad_norm": 7.801431655883789, + "kl": 2.1236249394714832, + "learning_rate": 4.019491338646444e-07, + "loss": 0.2124, + "num_tokens": 24519913.0, + "reward": 0.72723388671875, + "reward_std": 0.005402359180152416, + "rewards//mean": 0.72723388671875, + "rewards//std": 0.02962309867143631, + "step": 2837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5676, + "grad_norm": 1.6378487348556519, + "kl": 0.8963425159454346, + "learning_rate": 4.016379825121401e-07, + "loss": 0.0896, + "num_tokens": 24528489.0, + "reward": 0.74993896484375, + "reward_std": 0.005480306223034859, + "rewards//mean": 0.74993896484375, + "rewards//std": 0.020885493606328964, + "step": 2838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5678, + "grad_norm": 2.72200083732605, + "kl": 1.2552668061107397, + "learning_rate": 4.013268707798447e-07, + "loss": 0.1255, + "num_tokens": 24537057.0, + "reward": 0.7188720703125, + "reward_std": 0.009600143879652023, + "rewards//mean": 0.7188720703125, + "rewards//std": 0.03693990409374237, + "step": 2839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.568, + "grad_norm": 3.2289915084838867, + "kl": 1.714672639966011, + "learning_rate": 4.010157987930738e-07, + "loss": 0.1715, + "num_tokens": 24545657.0, + "reward": 0.77862548828125, + "reward_std": 0.012070175260305405, + "rewards//mean": 0.77862548828125, + "rewards//std": 0.04071442037820816, + "step": 2840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5682, + "grad_norm": 3.4576942920684814, + "kl": 1.6265817675739527, + "learning_rate": 4.0070476667712736e-07, + "loss": 0.1627, + "num_tokens": 24554265.0, + "reward": 0.78662109375, + "reward_std": 0.014543876051902771, + "rewards//mean": 0.78662109375, + "rewards//std": 0.028633149340748787, + "step": 2841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5684, + "grad_norm": 3.6974987983703613, + "kl": 0.8912419956177473, + "learning_rate": 4.00393774557289e-07, + "loss": 0.0891, + "num_tokens": 24562817.0, + "reward": 0.78076171875, + "reward_std": 0.003405903000384569, + "rewards//mean": 0.78076171875, + "rewards//std": 0.022418100386857986, + "step": 2842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5686, + "grad_norm": 1.361642599105835, + "kl": 0.7872443608939648, + "learning_rate": 4.000828225588264e-07, + "loss": 0.0787, + "num_tokens": 24571473.0, + "reward": 0.7686767578125, + "reward_std": 0.002056740690022707, + "rewards//mean": 0.7686767578125, + "rewards//std": 0.012223916128277779, + "step": 2843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5688, + "grad_norm": 7.359623432159424, + "kl": 1.8716084714978933, + "learning_rate": 3.9977191080699087e-07, + "loss": 0.1872, + "num_tokens": 24580073.0, + "reward": 0.78131103515625, + "reward_std": 0.015632355585694313, + "rewards//mean": 0.78131103515625, + "rewards//std": 0.026424750685691833, + "step": 2844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.569, + "grad_norm": 1.366904854774475, + "kl": 0.872997734695673, + "learning_rate": 3.9946103942701775e-07, + "loss": 0.0873, + "num_tokens": 24588689.0, + "reward": 0.75115966796875, + "reward_std": 0.008565717376768589, + "rewards//mean": 0.75115966796875, + "rewards//std": 0.02437644451856613, + "step": 2845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5692, + "grad_norm": 2.9693143367767334, + "kl": 1.4123916625976562, + "learning_rate": 3.9915020854412585e-07, + "loss": 0.1412, + "num_tokens": 24597289.0, + "reward": 0.76446533203125, + "reward_std": 0.006671492010354996, + "rewards//mean": 0.76446533203125, + "rewards//std": 0.028234325349330902, + "step": 2846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5694, + "grad_norm": 0.9857406616210938, + "kl": 0.6219523102045059, + "learning_rate": 3.9883941828351796e-07, + "loss": 0.0622, + "num_tokens": 24605905.0, + "reward": 0.74749755859375, + "reward_std": 0.0027418939862400293, + "rewards//mean": 0.74749755859375, + "rewards//std": 0.027218278497457504, + "step": 2847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5696, + "grad_norm": 4.938109397888184, + "kl": 2.1172143816947937, + "learning_rate": 3.9852866877038017e-07, + "loss": 0.2117, + "num_tokens": 24614473.0, + "reward": 0.734130859375, + "reward_std": 0.022240901365876198, + "rewards//mean": 0.734130859375, + "rewards//std": 0.04406176134943962, + "step": 2848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5698, + "grad_norm": 2.2933638095855713, + "kl": 2.1862348094582558, + "learning_rate": 3.9821796012988264e-07, + "loss": 0.2186, + "num_tokens": 24623105.0, + "reward": 0.74151611328125, + "reward_std": 0.014641113579273224, + "rewards//mean": 0.74151611328125, + "rewards//std": 0.0406101830303669, + "step": 2849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.57, + "grad_norm": 0.939446210861206, + "kl": 1.2345572263002396, + "learning_rate": 3.9790729248717843e-07, + "loss": 0.1235, + "num_tokens": 24631713.0, + "reward": 0.7498779296875, + "reward_std": 0.005368844140321016, + "rewards//mean": 0.7498779296875, + "rewards//std": 0.029708124697208405, + "step": 2850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5702, + "grad_norm": 3.2941882610321045, + "kl": 1.2329635936766863, + "learning_rate": 3.9759666596740473e-07, + "loss": 0.1233, + "num_tokens": 24640313.0, + "reward": 0.74005126953125, + "reward_std": 0.010258413851261139, + "rewards//mean": 0.74005126953125, + "rewards//std": 0.03465268388390541, + "step": 2851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5704, + "grad_norm": 1.8356871604919434, + "kl": 1.4185469951480627, + "learning_rate": 3.972860806956816e-07, + "loss": 0.1419, + "num_tokens": 24648905.0, + "reward": 0.749267578125, + "reward_std": 0.0026642712764441967, + "rewards//mean": 0.749267578125, + "rewards//std": 0.026188896968960762, + "step": 2852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5706, + "grad_norm": 2.6941213607788086, + "kl": 1.0374937299638987, + "learning_rate": 3.9697553679711307e-07, + "loss": 0.1037, + "num_tokens": 24657569.0, + "reward": 0.777587890625, + "reward_std": 0.007803292479366064, + "rewards//mean": 0.777587890625, + "rewards//std": 0.030693797394633293, + "step": 2853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5708, + "grad_norm": 2.187129259109497, + "kl": 1.7105520255863667, + "learning_rate": 3.9666503439678576e-07, + "loss": 0.1711, + "num_tokens": 24666257.0, + "reward": 0.76654052734375, + "reward_std": 0.00882560946047306, + "rewards//mean": 0.76654052734375, + "rewards//std": 0.01872391812503338, + "step": 2854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.571, + "grad_norm": 8.421218872070312, + "kl": 2.36005019582808, + "learning_rate": 3.9635457361977045e-07, + "loss": 0.236, + "num_tokens": 24674817.0, + "reward": 0.73699951171875, + "reward_std": 0.01037693116813898, + "rewards//mean": 0.73699951171875, + "rewards//std": 0.024370234459638596, + "step": 2855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5712, + "grad_norm": 2.0182571411132812, + "kl": 1.015860551968217, + "learning_rate": 3.960441545911204e-07, + "loss": 0.1016, + "num_tokens": 24683465.0, + "reward": 0.76348876953125, + "reward_std": 0.005500465165823698, + "rewards//mean": 0.76348876953125, + "rewards//std": 0.02899818681180477, + "step": 2856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5714, + "grad_norm": 1.1454188823699951, + "kl": 0.6423252318054438, + "learning_rate": 3.9573377743587246e-07, + "loss": 0.0642, + "num_tokens": 24692169.0, + "reward": 0.7528076171875, + "reward_std": 0.003434475278481841, + "rewards//mean": 0.7528076171875, + "rewards//std": 0.030004223808646202, + "step": 2857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5716, + "grad_norm": 2.8017640113830566, + "kl": 1.2726697456091642, + "learning_rate": 3.954234422790465e-07, + "loss": 0.1273, + "num_tokens": 24700857.0, + "reward": 0.7440185546875, + "reward_std": 0.011102795600891113, + "rewards//mean": 0.7440185546875, + "rewards//std": 0.039610300213098526, + "step": 2858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5718, + "grad_norm": 2.477567434310913, + "kl": 1.6436695139855146, + "learning_rate": 3.951131492456454e-07, + "loss": 0.1644, + "num_tokens": 24709681.0, + "reward": 0.76910400390625, + "reward_std": 0.011545747518539429, + "rewards//mean": 0.76910400390625, + "rewards//std": 0.0438758060336113, + "step": 2859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.572, + "grad_norm": 1.9772716760635376, + "kl": 0.7613619193434715, + "learning_rate": 3.948028984606554e-07, + "loss": 0.0761, + "num_tokens": 24718273.0, + "reward": 0.76739501953125, + "reward_std": 0.003701980458572507, + "rewards//mean": 0.76739501953125, + "rewards//std": 0.02427937462925911, + "step": 2860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5722, + "grad_norm": 3.798412561416626, + "kl": 1.0803110171109438, + "learning_rate": 3.9449269004904516e-07, + "loss": 0.108, + "num_tokens": 24726913.0, + "reward": 0.7674560546875, + "reward_std": 0.012386959046125412, + "rewards//mean": 0.7674560546875, + "rewards//std": 0.029156772419810295, + "step": 2861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5724, + "grad_norm": 3.071577310562134, + "kl": 1.447898956015706, + "learning_rate": 3.941825241357669e-07, + "loss": 0.1448, + "num_tokens": 24735497.0, + "reward": 0.75421142578125, + "reward_std": 0.012044334784150124, + "rewards//mean": 0.75421142578125, + "rewards//std": 0.02970270998775959, + "step": 2862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5726, + "grad_norm": 1.1278499364852905, + "kl": 0.9992974400520325, + "learning_rate": 3.9387240084575514e-07, + "loss": 0.0999, + "num_tokens": 24744201.0, + "reward": 0.76312255859375, + "reward_std": 0.005103797186166048, + "rewards//mean": 0.76312255859375, + "rewards//std": 0.03301653265953064, + "step": 2863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5728, + "grad_norm": 2.3847978115081787, + "kl": 0.9268031772226095, + "learning_rate": 3.935623203039277e-07, + "loss": 0.0927, + "num_tokens": 24752841.0, + "reward": 0.77447509765625, + "reward_std": 0.007537885569036007, + "rewards//mean": 0.77447509765625, + "rewards//std": 0.031259626150131226, + "step": 2864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.573, + "grad_norm": 1.3187930583953857, + "kl": 0.9877217337489128, + "learning_rate": 3.9325228263518484e-07, + "loss": 0.0988, + "num_tokens": 24761497.0, + "reward": 0.747314453125, + "reward_std": 0.00388380978256464, + "rewards//mean": 0.747314453125, + "rewards//std": 0.026501474902033806, + "step": 2865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5732, + "grad_norm": 6.1575822830200195, + "kl": 1.854195710271597, + "learning_rate": 3.9294228796440986e-07, + "loss": 0.1854, + "num_tokens": 24770081.0, + "reward": 0.72894287109375, + "reward_std": 0.010574898682534695, + "rewards//mean": 0.72894287109375, + "rewards//std": 0.0389980711042881, + "step": 2866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5734, + "grad_norm": 2.3770430088043213, + "kl": 1.3740087524056435, + "learning_rate": 3.9263233641646836e-07, + "loss": 0.1374, + "num_tokens": 24778665.0, + "reward": 0.78656005859375, + "reward_std": 0.009839614853262901, + "rewards//mean": 0.78656005859375, + "rewards//std": 0.032692164182662964, + "step": 2867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5736, + "grad_norm": 1.6476397514343262, + "kl": 1.2667736951261759, + "learning_rate": 3.923224281162091e-07, + "loss": 0.1267, + "num_tokens": 24787289.0, + "reward": 0.76092529296875, + "reward_std": 0.01032093446701765, + "rewards//mean": 0.76092529296875, + "rewards//std": 0.03257201611995697, + "step": 2868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5738, + "grad_norm": 4.1392083168029785, + "kl": 2.292639434337616, + "learning_rate": 3.920125631884627e-07, + "loss": 0.2293, + "num_tokens": 24795889.0, + "reward": 0.75299072265625, + "reward_std": 0.01150448713451624, + "rewards//mean": 0.75299072265625, + "rewards//std": 0.03370174020528793, + "step": 2869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.574, + "grad_norm": 1.0720202922821045, + "kl": 1.0931424088776112, + "learning_rate": 3.917027417580431e-07, + "loss": 0.1093, + "num_tokens": 24804609.0, + "reward": 0.769775390625, + "reward_std": 0.0047827111557126045, + "rewards//mean": 0.769775390625, + "rewards//std": 0.031657155603170395, + "step": 2870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5742, + "grad_norm": 2.7081947326660156, + "kl": 1.3113758526742458, + "learning_rate": 3.913929639497462e-07, + "loss": 0.1311, + "num_tokens": 24813193.0, + "reward": 0.73284912109375, + "reward_std": 0.005696638487279415, + "rewards//mean": 0.73284912109375, + "rewards//std": 0.023964976891875267, + "step": 2871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5744, + "grad_norm": 2.885650396347046, + "kl": 1.3029054496437311, + "learning_rate": 3.910832298883503e-07, + "loss": 0.1303, + "num_tokens": 24821777.0, + "reward": 0.749755859375, + "reward_std": 0.011630555614829063, + "rewards//mean": 0.749755859375, + "rewards//std": 0.032090287655591965, + "step": 2872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5746, + "grad_norm": 10.519225120544434, + "kl": 2.9547567199915648, + "learning_rate": 3.907735396986165e-07, + "loss": 0.2955, + "num_tokens": 24830465.0, + "reward": 0.76666259765625, + "reward_std": 0.015747109428048134, + "rewards//mean": 0.76666259765625, + "rewards//std": 0.03445158898830414, + "step": 2873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5748, + "grad_norm": 2.0334110260009766, + "kl": 1.230003384873271, + "learning_rate": 3.904638935052876e-07, + "loss": 0.123, + "num_tokens": 24839145.0, + "reward": 0.72503662109375, + "reward_std": 0.007793288677930832, + "rewards//mean": 0.72503662109375, + "rewards//std": 0.04294686019420624, + "step": 2874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.575, + "grad_norm": 2.8829345703125, + "kl": 0.9360679723322392, + "learning_rate": 3.9015429143308957e-07, + "loss": 0.0936, + "num_tokens": 24847809.0, + "reward": 0.75347900390625, + "reward_std": 0.006197234615683556, + "rewards//mean": 0.75347900390625, + "rewards//std": 0.029676707461476326, + "step": 2875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5752, + "grad_norm": 4.016130447387695, + "kl": 1.7749811839312315, + "learning_rate": 3.8984473360672967e-07, + "loss": 0.1775, + "num_tokens": 24856521.0, + "reward": 0.7703857421875, + "reward_std": 0.012462177313864231, + "rewards//mean": 0.7703857421875, + "rewards//std": 0.025272004306316376, + "step": 2876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5754, + "grad_norm": 2.8069870471954346, + "kl": 0.9605435188859701, + "learning_rate": 3.89535220150898e-07, + "loss": 0.0961, + "num_tokens": 24865145.0, + "reward": 0.78955078125, + "reward_std": 0.010012742131948471, + "rewards//mean": 0.78955078125, + "rewards//std": 0.024719996377825737, + "step": 2877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5756, + "grad_norm": 1.754363775253296, + "kl": 0.8147229012101889, + "learning_rate": 3.8922575119026635e-07, + "loss": 0.0815, + "num_tokens": 24873785.0, + "reward": 0.75469970703125, + "reward_std": 0.005375477019697428, + "rewards//mean": 0.75469970703125, + "rewards//std": 0.024989726021885872, + "step": 2878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5758, + "grad_norm": 5.300290584564209, + "kl": 3.0344097819179296, + "learning_rate": 3.8891632684948895e-07, + "loss": 0.3034, + "num_tokens": 24882417.0, + "reward": 0.75579833984375, + "reward_std": 0.01360536739230156, + "rewards//mean": 0.75579833984375, + "rewards//std": 0.021931972354650497, + "step": 2879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.576, + "grad_norm": 3.7213964462280273, + "kl": 1.8619213290512562, + "learning_rate": 3.886069472532017e-07, + "loss": 0.1862, + "num_tokens": 24891033.0, + "reward": 0.79443359375, + "reward_std": 0.02071780152618885, + "rewards//mean": 0.79443359375, + "rewards//std": 0.03342577815055847, + "step": 2880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5762, + "grad_norm": 3.218445301055908, + "kl": 0.9604776222258806, + "learning_rate": 3.882976125260229e-07, + "loss": 0.096, + "num_tokens": 24899673.0, + "reward": 0.76806640625, + "reward_std": 0.005524271633476019, + "rewards//mean": 0.76806640625, + "rewards//std": 0.027669545263051987, + "step": 2881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5764, + "grad_norm": 0.9913255572319031, + "kl": 1.0727346427738667, + "learning_rate": 3.879883227925523e-07, + "loss": 0.1073, + "num_tokens": 24908321.0, + "reward": 0.72283935546875, + "reward_std": 0.007737959735095501, + "rewards//mean": 0.72283935546875, + "rewards//std": 0.029995836317539215, + "step": 2882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5766, + "grad_norm": 3.475794553756714, + "kl": 1.2210092321038246, + "learning_rate": 3.87679078177372e-07, + "loss": 0.1221, + "num_tokens": 24916969.0, + "reward": 0.7628173828125, + "reward_std": 0.008624376729130745, + "rewards//mean": 0.7628173828125, + "rewards//std": 0.027183566242456436, + "step": 2883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5768, + "grad_norm": 2.521831512451172, + "kl": 1.431194880977273, + "learning_rate": 3.8736987880504546e-07, + "loss": 0.1431, + "num_tokens": 24925673.0, + "reward": 0.75701904296875, + "reward_std": 0.012469821609556675, + "rewards//mean": 0.75701904296875, + "rewards//std": 0.029709843918681145, + "step": 2884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.577, + "grad_norm": 3.478924036026001, + "kl": 0.9622517786920071, + "learning_rate": 3.870607248001184e-07, + "loss": 0.0962, + "num_tokens": 24934401.0, + "reward": 0.70977783203125, + "reward_std": 0.0074232397601008415, + "rewards//mean": 0.70977783203125, + "rewards//std": 0.039797645062208176, + "step": 2885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5772, + "grad_norm": 5.086318492889404, + "kl": 1.6613334342837334, + "learning_rate": 3.8675161628711773e-07, + "loss": 0.1661, + "num_tokens": 24943001.0, + "reward": 0.756591796875, + "reward_std": 0.011422664858400822, + "rewards//mean": 0.756591796875, + "rewards//std": 0.033623550087213516, + "step": 2886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5774, + "grad_norm": 3.9888803958892822, + "kl": 0.9865813702344894, + "learning_rate": 3.8644255339055266e-07, + "loss": 0.0987, + "num_tokens": 24951601.0, + "reward": 0.7633056640625, + "reward_std": 0.006879429332911968, + "rewards//mean": 0.7633056640625, + "rewards//std": 0.02316431887447834, + "step": 2887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5776, + "grad_norm": 3.674445152282715, + "kl": 1.143939208239317, + "learning_rate": 3.861335362349134e-07, + "loss": 0.1144, + "num_tokens": 24960201.0, + "reward": 0.78857421875, + "reward_std": 0.007237609010189772, + "rewards//mean": 0.78857421875, + "rewards//std": 0.037373583763837814, + "step": 2888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5778, + "grad_norm": 3.1411032676696777, + "kl": 1.1831684205681086, + "learning_rate": 3.8582456494467206e-07, + "loss": 0.1183, + "num_tokens": 24968937.0, + "reward": 0.74822998046875, + "reward_std": 0.005086447112262249, + "rewards//mean": 0.74822998046875, + "rewards//std": 0.02242274209856987, + "step": 2889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.578, + "grad_norm": 3.5862295627593994, + "kl": 2.0713862515985966, + "learning_rate": 3.8551563964428247e-07, + "loss": 0.2071, + "num_tokens": 24977481.0, + "reward": 0.770263671875, + "reward_std": 0.010184851475059986, + "rewards//mean": 0.770263671875, + "rewards//std": 0.02881338819861412, + "step": 2890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5782, + "grad_norm": 5.6024370193481445, + "kl": 1.8228002339601517, + "learning_rate": 3.852067604581794e-07, + "loss": 0.1823, + "num_tokens": 24986057.0, + "reward": 0.73577880859375, + "reward_std": 0.006137916352599859, + "rewards//mean": 0.73577880859375, + "rewards//std": 0.02537623979151249, + "step": 2891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5784, + "grad_norm": 5.521358013153076, + "kl": 1.2860528007149696, + "learning_rate": 3.848979275107796e-07, + "loss": 0.1286, + "num_tokens": 24994777.0, + "reward": 0.771728515625, + "reward_std": 0.004776159301400185, + "rewards//mean": 0.771728515625, + "rewards//std": 0.03109363093972206, + "step": 2892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5786, + "grad_norm": 2.419461488723755, + "kl": 1.6053543761372566, + "learning_rate": 3.845891409264807e-07, + "loss": 0.1605, + "num_tokens": 25003401.0, + "reward": 0.75799560546875, + "reward_std": 0.006750599481165409, + "rewards//mean": 0.75799560546875, + "rewards//std": 0.025830896571278572, + "step": 2893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5788, + "grad_norm": 11.757281303405762, + "kl": 1.9018959756940603, + "learning_rate": 3.8428040082966217e-07, + "loss": 0.1902, + "num_tokens": 25012001.0, + "reward": 0.74896240234375, + "reward_std": 0.005783796310424805, + "rewards//mean": 0.74896240234375, + "rewards//std": 0.02722606435418129, + "step": 2894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.579, + "grad_norm": 4.887277603149414, + "kl": 2.279204286634922, + "learning_rate": 3.839717073446842e-07, + "loss": 0.2279, + "num_tokens": 25020585.0, + "reward": 0.7481689453125, + "reward_std": 0.010662375018000603, + "rewards//mean": 0.7481689453125, + "rewards//std": 0.03425224497914314, + "step": 2895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5792, + "grad_norm": 0.0777413472533226, + "kl": 0.44031156226992607, + "learning_rate": 3.8366306059588876e-07, + "loss": 0.044, + "num_tokens": 25029113.0, + "reward": 0.7607421875, + "reward_std": 0.0, + "rewards//mean": 0.7607421875, + "rewards//std": 0.02944633737206459, + "step": 2896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5794, + "grad_norm": 3.8421013355255127, + "kl": 1.4856318179517984, + "learning_rate": 3.8335446070759855e-07, + "loss": 0.1486, + "num_tokens": 25037881.0, + "reward": 0.76629638671875, + "reward_std": 0.010429825633764267, + "rewards//mean": 0.76629638671875, + "rewards//std": 0.03388673812150955, + "step": 2897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5796, + "grad_norm": 2.8649139404296875, + "kl": 0.914740202948451, + "learning_rate": 3.8304590780411766e-07, + "loss": 0.0915, + "num_tokens": 25046545.0, + "reward": 0.75653076171875, + "reward_std": 0.005120365414768457, + "rewards//mean": 0.75653076171875, + "rewards//std": 0.02172183059155941, + "step": 2898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5798, + "grad_norm": 2.894110918045044, + "kl": 1.1752416342496872, + "learning_rate": 3.8273740200973103e-07, + "loss": 0.1175, + "num_tokens": 25055065.0, + "reward": 0.73858642578125, + "reward_std": 0.005320030730217695, + "rewards//mean": 0.73858642578125, + "rewards//std": 0.032371558248996735, + "step": 2899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.58, + "grad_norm": 1.218570351600647, + "kl": 1.2086050175130367, + "learning_rate": 3.8242894344870495e-07, + "loss": 0.1209, + "num_tokens": 25063729.0, + "reward": 0.73529052734375, + "reward_std": 0.004976191557943821, + "rewards//mean": 0.73529052734375, + "rewards//std": 0.027513664215803146, + "step": 2900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5802, + "grad_norm": 1.2146533727645874, + "kl": 1.2067123372107744, + "learning_rate": 3.821205322452863e-07, + "loss": 0.1207, + "num_tokens": 25072337.0, + "reward": 0.7364501953125, + "reward_std": 0.00893724150955677, + "rewards//mean": 0.7364501953125, + "rewards//std": 0.027125589549541473, + "step": 2901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5804, + "grad_norm": 2.632699489593506, + "kl": 1.0411553103476763, + "learning_rate": 3.8181216852370324e-07, + "loss": 0.1041, + "num_tokens": 25080913.0, + "reward": 0.76397705078125, + "reward_std": 0.010115432552993298, + "rewards//mean": 0.76397705078125, + "rewards//std": 0.030645858496427536, + "step": 2902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5806, + "grad_norm": 2.7581443786621094, + "kl": 1.3435988519340754, + "learning_rate": 3.8150385240816455e-07, + "loss": 0.1344, + "num_tokens": 25089449.0, + "reward": 0.751220703125, + "reward_std": 0.010779851116240025, + "rewards//mean": 0.751220703125, + "rewards//std": 0.026774248108267784, + "step": 2903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5808, + "grad_norm": 2.344057083129883, + "kl": 1.7375097181648016, + "learning_rate": 3.811955840228599e-07, + "loss": 0.1738, + "num_tokens": 25098049.0, + "reward": 0.742919921875, + "reward_std": 0.009003902785480022, + "rewards//mean": 0.742919921875, + "rewards//std": 0.026012586429715157, + "step": 2904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.581, + "grad_norm": 6.341611385345459, + "kl": 1.671227691695094, + "learning_rate": 3.808873634919599e-07, + "loss": 0.1671, + "num_tokens": 25106777.0, + "reward": 0.75714111328125, + "reward_std": 0.008980963379144669, + "rewards//mean": 0.75714111328125, + "rewards//std": 0.047763340175151825, + "step": 2905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5812, + "grad_norm": 1.2366228103637695, + "kl": 0.615428002551198, + "learning_rate": 3.805791909396155e-07, + "loss": 0.0615, + "num_tokens": 25115449.0, + "reward": 0.7615966796875, + "reward_std": 0.002178395399823785, + "rewards//mean": 0.7615966796875, + "rewards//std": 0.02757720649242401, + "step": 2906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5814, + "grad_norm": 6.13063907623291, + "kl": 1.0890697482973337, + "learning_rate": 3.8027106648995875e-07, + "loss": 0.1089, + "num_tokens": 25124241.0, + "reward": 0.7391357421875, + "reward_std": 0.005511388182640076, + "rewards//mean": 0.7391357421875, + "rewards//std": 0.02692393958568573, + "step": 2907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5816, + "grad_norm": 1.5383005142211914, + "kl": 1.1366916317492723, + "learning_rate": 3.799629902671021e-07, + "loss": 0.1137, + "num_tokens": 25132953.0, + "reward": 0.74591064453125, + "reward_std": 0.007306367624551058, + "rewards//mean": 0.74591064453125, + "rewards//std": 0.027218835428357124, + "step": 2908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5818, + "grad_norm": 5.29603910446167, + "kl": 1.207690665498376, + "learning_rate": 3.7965496239513874e-07, + "loss": 0.1208, + "num_tokens": 25141569.0, + "reward": 0.761474609375, + "reward_std": 0.00411243224516511, + "rewards//mean": 0.761474609375, + "rewards//std": 0.030152438208460808, + "step": 2909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.582, + "grad_norm": 5.143453121185303, + "kl": 0.7798364870250225, + "learning_rate": 3.7934698299814196e-07, + "loss": 0.078, + "num_tokens": 25150177.0, + "reward": 0.78363037109375, + "reward_std": 0.0010538268834352493, + "rewards//mean": 0.78363037109375, + "rewards//std": 0.02703300304710865, + "step": 2910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5822, + "grad_norm": 2.3404674530029297, + "kl": 0.9517956115305424, + "learning_rate": 3.790390522001662e-07, + "loss": 0.0952, + "num_tokens": 25158857.0, + "reward": 0.76251220703125, + "reward_std": 0.006683473940938711, + "rewards//mean": 0.76251220703125, + "rewards//std": 0.02810966596007347, + "step": 2911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5824, + "grad_norm": 2.460749864578247, + "kl": 1.2004188019782305, + "learning_rate": 3.787311701252457e-07, + "loss": 0.12, + "num_tokens": 25167665.0, + "reward": 0.77496337890625, + "reward_std": 0.005581947043538094, + "rewards//mean": 0.77496337890625, + "rewards//std": 0.028886782005429268, + "step": 2912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5826, + "grad_norm": 2.193575382232666, + "kl": 1.251411898061633, + "learning_rate": 3.784233368973952e-07, + "loss": 0.1251, + "num_tokens": 25176241.0, + "reward": 0.756591796875, + "reward_std": 0.011376041918992996, + "rewards//mean": 0.756591796875, + "rewards//std": 0.03623092547059059, + "step": 2913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5828, + "grad_norm": 1.8785489797592163, + "kl": 0.9331819377839565, + "learning_rate": 3.7811555264061024e-07, + "loss": 0.0933, + "num_tokens": 25184929.0, + "reward": 0.759765625, + "reward_std": 0.006695899181067944, + "rewards//mean": 0.759765625, + "rewards//std": 0.024449070915579796, + "step": 2914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.583, + "grad_norm": 1.5429750680923462, + "kl": 0.5793572627007961, + "learning_rate": 3.7780781747886594e-07, + "loss": 0.0579, + "num_tokens": 25193521.0, + "reward": 0.76953125, + "reward_std": 0.004423785954713821, + "rewards//mean": 0.76953125, + "rewards//std": 0.020351096987724304, + "step": 2915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5832, + "grad_norm": 3.18100905418396, + "kl": 0.9547933097928762, + "learning_rate": 3.7750013153611827e-07, + "loss": 0.0955, + "num_tokens": 25202121.0, + "reward": 0.74761962890625, + "reward_std": 0.006088032387197018, + "rewards//mean": 0.74761962890625, + "rewards//std": 0.03412932902574539, + "step": 2916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5834, + "grad_norm": 6.164799690246582, + "kl": 1.2117119263857603, + "learning_rate": 3.7719249493630297e-07, + "loss": 0.1212, + "num_tokens": 25210777.0, + "reward": 0.7418212890625, + "reward_std": 0.012059032917022705, + "rewards//mean": 0.7418212890625, + "rewards//std": 0.031755268573760986, + "step": 2917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5836, + "grad_norm": 3.864914655685425, + "kl": 1.8814155869185925, + "learning_rate": 3.768849078033359e-07, + "loss": 0.1881, + "num_tokens": 25219473.0, + "reward": 0.7552490234375, + "reward_std": 0.015670623630285263, + "rewards//mean": 0.7552490234375, + "rewards//std": 0.031865671277046204, + "step": 2918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5838, + "grad_norm": 4.7242302894592285, + "kl": 1.4014372508972883, + "learning_rate": 3.7657737026111335e-07, + "loss": 0.1401, + "num_tokens": 25228185.0, + "reward": 0.7457275390625, + "reward_std": 0.011918429285287857, + "rewards//mean": 0.7457275390625, + "rewards//std": 0.027645189315080643, + "step": 2919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.584, + "grad_norm": 3.321138381958008, + "kl": 1.0440570916980505, + "learning_rate": 3.762698824335112e-07, + "loss": 0.1044, + "num_tokens": 25236753.0, + "reward": 0.76275634765625, + "reward_std": 0.005436545237898827, + "rewards//mean": 0.76275634765625, + "rewards//std": 0.017665637657046318, + "step": 2920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5842, + "grad_norm": 1.736484408378601, + "kl": 0.5573320128023624, + "learning_rate": 3.7596244444438574e-07, + "loss": 0.0557, + "num_tokens": 25245337.0, + "reward": 0.775634765625, + "reward_std": 0.002762136049568653, + "rewards//mean": 0.775634765625, + "rewards//std": 0.029139375314116478, + "step": 2921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5844, + "grad_norm": 1.178229808807373, + "kl": 0.949693102389574, + "learning_rate": 3.7565505641757266e-07, + "loss": 0.095, + "num_tokens": 25253945.0, + "reward": 0.7354736328125, + "reward_std": 0.005426804535090923, + "rewards//mean": 0.7354736328125, + "rewards//std": 0.027502451092004776, + "step": 2922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5846, + "grad_norm": 1.161841630935669, + "kl": 0.9800237398594618, + "learning_rate": 3.7534771847688814e-07, + "loss": 0.098, + "num_tokens": 25262537.0, + "reward": 0.7806396484375, + "reward_std": 0.007429791148751974, + "rewards//mean": 0.7806396484375, + "rewards//std": 0.02438179962337017, + "step": 2923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5848, + "grad_norm": 1.1193159818649292, + "kl": 0.61169115267694, + "learning_rate": 3.750404307461276e-07, + "loss": 0.0612, + "num_tokens": 25271145.0, + "reward": 0.7950439453125, + "reward_std": 0.0022976321633905172, + "rewards//mean": 0.7950439453125, + "rewards//std": 0.023375090211629868, + "step": 2924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.585, + "grad_norm": 1.744962215423584, + "kl": 0.7847408857196569, + "learning_rate": 3.7473319334906673e-07, + "loss": 0.0785, + "num_tokens": 25279809.0, + "reward": 0.746337890625, + "reward_std": 0.006266321055591106, + "rewards//mean": 0.746337890625, + "rewards//std": 0.027751486748456955, + "step": 2925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5852, + "grad_norm": 2.304675579071045, + "kl": 1.5284422002732754, + "learning_rate": 3.744260064094604e-07, + "loss": 0.1528, + "num_tokens": 25288457.0, + "reward": 0.76416015625, + "reward_std": 0.007546101696789265, + "rewards//mean": 0.76416015625, + "rewards//std": 0.024611983448266983, + "step": 2926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5854, + "grad_norm": 1.653802752494812, + "kl": 1.2024875096976757, + "learning_rate": 3.7411887005104395e-07, + "loss": 0.1202, + "num_tokens": 25297113.0, + "reward": 0.760498046875, + "reward_std": 0.008054269477725029, + "rewards//mean": 0.760498046875, + "rewards//std": 0.024178866297006607, + "step": 2927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5856, + "grad_norm": 3.1405227184295654, + "kl": 1.5627470295876265, + "learning_rate": 3.7381178439753135e-07, + "loss": 0.1563, + "num_tokens": 25305833.0, + "reward": 0.763427734375, + "reward_std": 0.010709173046052456, + "rewards//mean": 0.763427734375, + "rewards//std": 0.02998328022658825, + "step": 2928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5858, + "grad_norm": 2.257915735244751, + "kl": 1.6978377625346184, + "learning_rate": 3.73504749572617e-07, + "loss": 0.1698, + "num_tokens": 25314457.0, + "reward": 0.73956298828125, + "reward_std": 0.007842618972063065, + "rewards//mean": 0.73956298828125, + "rewards//std": 0.029537124559283257, + "step": 2929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.586, + "grad_norm": 1.9725359678268433, + "kl": 1.1673318538814783, + "learning_rate": 3.7319776569997434e-07, + "loss": 0.1167, + "num_tokens": 25323097.0, + "reward": 0.7615966796875, + "reward_std": 0.009055625647306442, + "rewards//mean": 0.7615966796875, + "rewards//std": 0.029023557901382446, + "step": 2930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5862, + "grad_norm": 1.5409036874771118, + "kl": 0.5901408772915602, + "learning_rate": 3.728908329032566e-07, + "loss": 0.059, + "num_tokens": 25331673.0, + "reward": 0.7210693359375, + "reward_std": 0.0025444694329053164, + "rewards//mean": 0.7210693359375, + "rewards//std": 0.03489499166607857, + "step": 2931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5864, + "grad_norm": 2.013103723526001, + "kl": 0.7742724735289812, + "learning_rate": 3.7258395130609606e-07, + "loss": 0.0774, + "num_tokens": 25340289.0, + "reward": 0.757080078125, + "reward_std": 0.004097769968211651, + "rewards//mean": 0.757080078125, + "rewards//std": 0.022106675431132317, + "step": 2932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5866, + "grad_norm": 2.6155965328216553, + "kl": 1.7795275654643774, + "learning_rate": 3.722771210321048e-07, + "loss": 0.178, + "num_tokens": 25348977.0, + "reward": 0.794677734375, + "reward_std": 0.01234513521194458, + "rewards//mean": 0.794677734375, + "rewards//std": 0.02354961447417736, + "step": 2933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5868, + "grad_norm": 2.1520025730133057, + "kl": 1.234590107575059, + "learning_rate": 3.719703422048739e-07, + "loss": 0.1235, + "num_tokens": 25357561.0, + "reward": 0.73114013671875, + "reward_std": 0.002951596863567829, + "rewards//mean": 0.73114013671875, + "rewards//std": 0.023689888417720795, + "step": 2934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.587, + "grad_norm": 2.2890708446502686, + "kl": 2.2860290706157684, + "learning_rate": 3.716636149479737e-07, + "loss": 0.2286, + "num_tokens": 25366177.0, + "reward": 0.75006103515625, + "reward_std": 0.017393410205841064, + "rewards//mean": 0.75006103515625, + "rewards//std": 0.03728269413113594, + "step": 2935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5872, + "grad_norm": 4.515402793884277, + "kl": 1.5157401952892542, + "learning_rate": 3.7135693938495426e-07, + "loss": 0.1516, + "num_tokens": 25374777.0, + "reward": 0.76287841796875, + "reward_std": 0.007290535606443882, + "rewards//mean": 0.76287841796875, + "rewards//std": 0.01749255321919918, + "step": 2936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5874, + "grad_norm": 5.70565938949585, + "kl": 0.7400479298084974, + "learning_rate": 3.710503156393441e-07, + "loss": 0.074, + "num_tokens": 25383425.0, + "reward": 0.739013671875, + "reward_std": 0.0020954408682882786, + "rewards//mean": 0.739013671875, + "rewards//std": 0.03246547281742096, + "step": 2937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5876, + "grad_norm": 4.550516605377197, + "kl": 1.8633367139846087, + "learning_rate": 3.7074374383465146e-07, + "loss": 0.1863, + "num_tokens": 25392233.0, + "reward": 0.77117919921875, + "reward_std": 0.012194883078336716, + "rewards//mean": 0.77117919921875, + "rewards//std": 0.030764181166887283, + "step": 2938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5878, + "grad_norm": 7.286697864532471, + "kl": 1.191249007359147, + "learning_rate": 3.704372240943633e-07, + "loss": 0.1191, + "num_tokens": 25400833.0, + "reward": 0.75201416015625, + "reward_std": 0.0027293239254504442, + "rewards//mean": 0.75201416015625, + "rewards//std": 0.02713472582399845, + "step": 2939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.588, + "grad_norm": 3.6624231338500977, + "kl": 2.6978115867823362, + "learning_rate": 3.701307565419458e-07, + "loss": 0.2698, + "num_tokens": 25409489.0, + "reward": 0.72589111328125, + "reward_std": 0.019558951258659363, + "rewards//mean": 0.72589111328125, + "rewards//std": 0.04126688838005066, + "step": 2940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5882, + "grad_norm": 2.7452900409698486, + "kl": 0.64956134557724, + "learning_rate": 3.6982434130084396e-07, + "loss": 0.065, + "num_tokens": 25418033.0, + "reward": 0.726806640625, + "reward_std": 0.004223444499075413, + "rewards//mean": 0.726806640625, + "rewards//std": 0.03437862917780876, + "step": 2941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5884, + "grad_norm": 5.2031097412109375, + "kl": 1.8079121690243483, + "learning_rate": 3.69517978494482e-07, + "loss": 0.1808, + "num_tokens": 25426673.0, + "reward": 0.76055908203125, + "reward_std": 0.009141894988715649, + "rewards//mean": 0.76055908203125, + "rewards//std": 0.03465792536735535, + "step": 2942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5886, + "grad_norm": 2.3682773113250732, + "kl": 1.906976232305169, + "learning_rate": 3.6921166824626257e-07, + "loss": 0.1907, + "num_tokens": 25435417.0, + "reward": 0.7525634765625, + "reward_std": 0.014391046948730946, + "rewards//mean": 0.7525634765625, + "rewards//std": 0.03253510594367981, + "step": 2943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5888, + "grad_norm": 5.028013706207275, + "kl": 2.3126772716641426, + "learning_rate": 3.689054106795677e-07, + "loss": 0.2313, + "num_tokens": 25444025.0, + "reward": 0.760498046875, + "reward_std": 0.013229141011834145, + "rewards//mean": 0.760498046875, + "rewards//std": 0.02802937850356102, + "step": 2944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.589, + "grad_norm": 1.7016352415084839, + "kl": 2.3417629953473806, + "learning_rate": 3.685992059177576e-07, + "loss": 0.2342, + "num_tokens": 25452697.0, + "reward": 0.75592041015625, + "reward_std": 0.015161093324422836, + "rewards//mean": 0.75592041015625, + "rewards//std": 0.027268286794424057, + "step": 2945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5892, + "grad_norm": 3.9929282665252686, + "kl": 2.0672084912657738, + "learning_rate": 3.6829305408417166e-07, + "loss": 0.2067, + "num_tokens": 25461385.0, + "reward": 0.75616455078125, + "reward_std": 0.0075363172218203545, + "rewards//mean": 0.75616455078125, + "rewards//std": 0.02570282481610775, + "step": 2946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5894, + "grad_norm": 2.2696990966796875, + "kl": 1.5808081701397896, + "learning_rate": 3.679869553021278e-07, + "loss": 0.1581, + "num_tokens": 25469897.0, + "reward": 0.78125, + "reward_std": 0.009897984564304352, + "rewards//mean": 0.78125, + "rewards//std": 0.01782611384987831, + "step": 2947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5896, + "grad_norm": 2.5548388957977295, + "kl": 1.0580792985856533, + "learning_rate": 3.676809096949226e-07, + "loss": 0.1058, + "num_tokens": 25478505.0, + "reward": 0.76849365234375, + "reward_std": 0.003828746033832431, + "rewards//mean": 0.76849365234375, + "rewards//std": 0.023001927882432938, + "step": 2948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5898, + "grad_norm": 3.381967544555664, + "kl": 1.2999161537736654, + "learning_rate": 3.6737491738583117e-07, + "loss": 0.13, + "num_tokens": 25487137.0, + "reward": 0.7855224609375, + "reward_std": 0.008704678155481815, + "rewards//mean": 0.7855224609375, + "rewards//std": 0.026989080011844635, + "step": 2949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.59, + "grad_norm": 2.596304178237915, + "kl": 1.575605260208249, + "learning_rate": 3.67068978498107e-07, + "loss": 0.1576, + "num_tokens": 25495785.0, + "reward": 0.7479248046875, + "reward_std": 0.012925012037158012, + "rewards//mean": 0.7479248046875, + "rewards//std": 0.032696619629859924, + "step": 2950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5902, + "grad_norm": 2.5986931324005127, + "kl": 1.7214981522411108, + "learning_rate": 3.6676309315498255e-07, + "loss": 0.1721, + "num_tokens": 25504385.0, + "reward": 0.755126953125, + "reward_std": 0.006326187402009964, + "rewards//mean": 0.755126953125, + "rewards//std": 0.02189752086997032, + "step": 2951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5904, + "grad_norm": 1.1624126434326172, + "kl": 0.6151155345141888, + "learning_rate": 3.6645726147966817e-07, + "loss": 0.0615, + "num_tokens": 25513009.0, + "reward": 0.8079833984375, + "reward_std": 0.0030127190984785557, + "rewards//mean": 0.8079833984375, + "rewards//std": 0.023686455562710762, + "step": 2952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5906, + "grad_norm": 4.930422782897949, + "kl": 2.2982791289687157, + "learning_rate": 3.6615148359535295e-07, + "loss": 0.2298, + "num_tokens": 25521705.0, + "reward": 0.72564697265625, + "reward_std": 0.009802762418985367, + "rewards//mean": 0.72564697265625, + "rewards//std": 0.03860560059547424, + "step": 2953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5908, + "grad_norm": 2.8943684101104736, + "kl": 1.1505038067698479, + "learning_rate": 3.6584575962520405e-07, + "loss": 0.1151, + "num_tokens": 25530297.0, + "reward": 0.72845458984375, + "reward_std": 0.0056719426065683365, + "rewards//mean": 0.72845458984375, + "rewards//std": 0.02895691804587841, + "step": 2954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.591, + "grad_norm": 2.5747885704040527, + "kl": 1.9602992683649063, + "learning_rate": 3.6554008969236715e-07, + "loss": 0.196, + "num_tokens": 25538977.0, + "reward": 0.78045654296875, + "reward_std": 0.011159081012010574, + "rewards//mean": 0.78045654296875, + "rewards//std": 0.03001803159713745, + "step": 2955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5912, + "grad_norm": 1.2925382852554321, + "kl": 1.3621100522577763, + "learning_rate": 3.652344739199661e-07, + "loss": 0.1362, + "num_tokens": 25547601.0, + "reward": 0.78363037109375, + "reward_std": 0.008508788421750069, + "rewards//mean": 0.78363037109375, + "rewards//std": 0.027914581820368767, + "step": 2956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5914, + "grad_norm": 2.232034683227539, + "kl": 1.7353591118007898, + "learning_rate": 3.649289124311028e-07, + "loss": 0.1735, + "num_tokens": 25556201.0, + "reward": 0.757568359375, + "reward_std": 0.010058999061584473, + "rewards//mean": 0.757568359375, + "rewards//std": 0.02541922777891159, + "step": 2957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5916, + "grad_norm": 2.947218179702759, + "kl": 0.9271517135202885, + "learning_rate": 3.6462340534885736e-07, + "loss": 0.0927, + "num_tokens": 25564881.0, + "reward": 0.7530517578125, + "reward_std": 0.006846866570413113, + "rewards//mean": 0.7530517578125, + "rewards//std": 0.02915261872112751, + "step": 2958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5918, + "grad_norm": 2.027225971221924, + "kl": 1.370457025244832, + "learning_rate": 3.6431795279628816e-07, + "loss": 0.137, + "num_tokens": 25573545.0, + "reward": 0.74658203125, + "reward_std": 0.006183089688420296, + "rewards//mean": 0.74658203125, + "rewards//std": 0.02777438797056675, + "step": 2959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.592, + "grad_norm": 0.6735411882400513, + "kl": 0.44165719859302044, + "learning_rate": 3.640125548964312e-07, + "loss": 0.0442, + "num_tokens": 25582201.0, + "reward": 0.7603759765625, + "reward_std": 0.0014385588001459837, + "rewards//mean": 0.7603759765625, + "rewards//std": 0.031282681971788406, + "step": 2960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5922, + "grad_norm": 2.0789027214050293, + "kl": 1.0543392952531576, + "learning_rate": 3.6370721177230115e-07, + "loss": 0.1054, + "num_tokens": 25590777.0, + "reward": 0.74322509765625, + "reward_std": 0.006943423300981522, + "rewards//mean": 0.74322509765625, + "rewards//std": 0.022110698744654655, + "step": 2961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5924, + "grad_norm": 2.4136569499969482, + "kl": 1.2079179994761944, + "learning_rate": 3.634019235468896e-07, + "loss": 0.1208, + "num_tokens": 25599361.0, + "reward": 0.7669677734375, + "reward_std": 0.005817200988531113, + "rewards//mean": 0.7669677734375, + "rewards//std": 0.026256727054715157, + "step": 2962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5926, + "grad_norm": 3.4735333919525146, + "kl": 1.0767007023096085, + "learning_rate": 3.630966903431671e-07, + "loss": 0.1077, + "num_tokens": 25607937.0, + "reward": 0.75018310546875, + "reward_std": 0.008739624172449112, + "rewards//mean": 0.75018310546875, + "rewards//std": 0.02831784076988697, + "step": 2963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5928, + "grad_norm": 6.7169904708862305, + "kl": 2.0835374910384417, + "learning_rate": 3.627915122840812e-07, + "loss": 0.2084, + "num_tokens": 25616601.0, + "reward": 0.75750732421875, + "reward_std": 0.0069363838993012905, + "rewards//mean": 0.75750732421875, + "rewards//std": 0.027562588453292847, + "step": 2964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.593, + "grad_norm": 3.311948537826538, + "kl": 0.7093781903386116, + "learning_rate": 3.624863894925579e-07, + "loss": 0.0709, + "num_tokens": 25625201.0, + "reward": 0.703369140625, + "reward_std": 0.004012967459857464, + "rewards//mean": 0.703369140625, + "rewards//std": 0.0409565344452858, + "step": 2965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5932, + "grad_norm": 1.880159616470337, + "kl": 0.8023883122950792, + "learning_rate": 3.621813220915004e-07, + "loss": 0.0802, + "num_tokens": 25633745.0, + "reward": 0.75433349609375, + "reward_std": 0.00557766854763031, + "rewards//mean": 0.75433349609375, + "rewards//std": 0.020577358081936836, + "step": 2966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5934, + "grad_norm": 4.903932094573975, + "kl": 1.6108340937644243, + "learning_rate": 3.6187631020378984e-07, + "loss": 0.1611, + "num_tokens": 25642377.0, + "reward": 0.77520751953125, + "reward_std": 0.013591557741165161, + "rewards//mean": 0.77520751953125, + "rewards//std": 0.026096759364008904, + "step": 2967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5936, + "grad_norm": 3.1622378826141357, + "kl": 1.5614755414426327, + "learning_rate": 3.615713539522851e-07, + "loss": 0.1561, + "num_tokens": 25651057.0, + "reward": 0.7733154296875, + "reward_std": 0.008702712133526802, + "rewards//mean": 0.7733154296875, + "rewards//std": 0.026115676388144493, + "step": 2968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5938, + "grad_norm": 2.1004228591918945, + "kl": 2.1369469705969095, + "learning_rate": 3.6126645345982237e-07, + "loss": 0.2137, + "num_tokens": 25659673.0, + "reward": 0.77166748046875, + "reward_std": 0.01444825530052185, + "rewards//mean": 0.77166748046875, + "rewards//std": 0.028636783361434937, + "step": 2969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.594, + "grad_norm": 3.3864035606384277, + "kl": 1.1602793503552675, + "learning_rate": 3.609616088492157e-07, + "loss": 0.116, + "num_tokens": 25668305.0, + "reward": 0.761962890625, + "reward_std": 0.009322209283709526, + "rewards//mean": 0.761962890625, + "rewards//std": 0.029114428907632828, + "step": 2970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5942, + "grad_norm": 1.9126341342926025, + "kl": 1.4768512472510338, + "learning_rate": 3.6065682024325617e-07, + "loss": 0.1477, + "num_tokens": 25676921.0, + "reward": 0.72210693359375, + "reward_std": 0.006451964378356934, + "rewards//mean": 0.72210693359375, + "rewards//std": 0.036457452923059464, + "step": 2971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5944, + "grad_norm": 2.2300121784210205, + "kl": 1.2102240789681673, + "learning_rate": 3.603520877647129e-07, + "loss": 0.121, + "num_tokens": 25685569.0, + "reward": 0.74456787109375, + "reward_std": 0.00720410980284214, + "rewards//mean": 0.74456787109375, + "rewards//std": 0.029843546450138092, + "step": 2972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5946, + "grad_norm": 2.029755115509033, + "kl": 2.033790700137615, + "learning_rate": 3.6004741153633187e-07, + "loss": 0.2034, + "num_tokens": 25694161.0, + "reward": 0.787841796875, + "reward_std": 0.016861075535416603, + "rewards//mean": 0.787841796875, + "rewards//std": 0.03470119461417198, + "step": 2973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5948, + "grad_norm": 1.7813624143600464, + "kl": 1.2202390506863594, + "learning_rate": 3.597427916808369e-07, + "loss": 0.122, + "num_tokens": 25702881.0, + "reward": 0.78887939453125, + "reward_std": 0.006985452491790056, + "rewards//mean": 0.78887939453125, + "rewards//std": 0.01716761477291584, + "step": 2974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.595, + "grad_norm": 2.9879825115203857, + "kl": 1.4840282164514065, + "learning_rate": 3.594382283209286e-07, + "loss": 0.1484, + "num_tokens": 25711545.0, + "reward": 0.7510986328125, + "reward_std": 0.005209947936236858, + "rewards//mean": 0.7510986328125, + "rewards//std": 0.03723704069852829, + "step": 2975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5952, + "grad_norm": 5.521975517272949, + "kl": 1.577912151813507, + "learning_rate": 3.591337215792851e-07, + "loss": 0.1578, + "num_tokens": 25720241.0, + "reward": 0.7490234375, + "reward_std": 0.005530821159482002, + "rewards//mean": 0.7490234375, + "rewards//std": 0.03996962681412697, + "step": 2976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5954, + "grad_norm": 7.2982964515686035, + "kl": 2.7387282866984606, + "learning_rate": 3.5882927157856167e-07, + "loss": 0.2739, + "num_tokens": 25728921.0, + "reward": 0.7418212890625, + "reward_std": 0.015395080670714378, + "rewards//mean": 0.7418212890625, + "rewards//std": 0.038718219846487045, + "step": 2977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5956, + "grad_norm": 0.6074886918067932, + "kl": 0.7812817897647619, + "learning_rate": 3.585248784413909e-07, + "loss": 0.0781, + "num_tokens": 25737505.0, + "reward": 0.7657470703125, + "reward_std": 0.0013686248566955328, + "rewards//mean": 0.7657470703125, + "rewards//std": 0.027778474614024162, + "step": 2978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5958, + "grad_norm": 3.106153964996338, + "kl": 1.0422262605279684, + "learning_rate": 3.58220542290382e-07, + "loss": 0.1042, + "num_tokens": 25746225.0, + "reward": 0.72613525390625, + "reward_std": 0.004918671678751707, + "rewards//mean": 0.72613525390625, + "rewards//std": 0.03768090903759003, + "step": 2979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.596, + "grad_norm": 3.5232458114624023, + "kl": 1.103571331128478, + "learning_rate": 3.5791626324812185e-07, + "loss": 0.1104, + "num_tokens": 25754785.0, + "reward": 0.75823974609375, + "reward_std": 0.006218265276402235, + "rewards//mean": 0.75823974609375, + "rewards//std": 0.02603345550596714, + "step": 2980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5962, + "grad_norm": 9.581036567687988, + "kl": 2.1684041023254395, + "learning_rate": 3.5761204143717385e-07, + "loss": 0.2168, + "num_tokens": 25763377.0, + "reward": 0.7244873046875, + "reward_std": 0.007560579106211662, + "rewards//mean": 0.7244873046875, + "rewards//std": 0.04088013991713524, + "step": 2981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5964, + "grad_norm": 2.5813968181610107, + "kl": 1.0106823053210974, + "learning_rate": 3.5730787698007846e-07, + "loss": 0.1011, + "num_tokens": 25771929.0, + "reward": 0.7445068359375, + "reward_std": 0.005810283124446869, + "rewards//mean": 0.7445068359375, + "rewards//std": 0.02315386012196541, + "step": 2982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5966, + "grad_norm": 2.5577991008758545, + "kl": 0.9984627012163401, + "learning_rate": 3.5700376999935334e-07, + "loss": 0.0998, + "num_tokens": 25780593.0, + "reward": 0.77032470703125, + "reward_std": 0.011613905429840088, + "rewards//mean": 0.77032470703125, + "rewards//std": 0.032712992280721664, + "step": 2983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5968, + "grad_norm": 4.7441792488098145, + "kl": 2.6448914166539907, + "learning_rate": 3.566997206174923e-07, + "loss": 0.2645, + "num_tokens": 25789281.0, + "reward": 0.75335693359375, + "reward_std": 0.014223654754459858, + "rewards//mean": 0.75335693359375, + "rewards//std": 0.04030674323439598, + "step": 2984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.597, + "grad_norm": 1.9822399616241455, + "kl": 1.4205818120390177, + "learning_rate": 3.5639572895696687e-07, + "loss": 0.1421, + "num_tokens": 25797961.0, + "reward": 0.79095458984375, + "reward_std": 0.006721897050738335, + "rewards//mean": 0.79095458984375, + "rewards//std": 0.022498900070786476, + "step": 2985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5972, + "grad_norm": 1.452100157737732, + "kl": 1.7771408930420876, + "learning_rate": 3.5609179514022446e-07, + "loss": 0.1777, + "num_tokens": 25806569.0, + "reward": 0.740966796875, + "reward_std": 0.012258218601346016, + "rewards//mean": 0.740966796875, + "rewards//std": 0.031603556126356125, + "step": 2986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5974, + "grad_norm": 4.4297590255737305, + "kl": 1.708623206242919, + "learning_rate": 3.5578791928968993e-07, + "loss": 0.1709, + "num_tokens": 25815449.0, + "reward": 0.77166748046875, + "reward_std": 0.011401031166315079, + "rewards//mean": 0.77166748046875, + "rewards//std": 0.025093700736761093, + "step": 2987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5976, + "grad_norm": 4.178742408752441, + "kl": 1.368068354204297, + "learning_rate": 3.554841015277641e-07, + "loss": 0.1368, + "num_tokens": 25824017.0, + "reward": 0.76422119140625, + "reward_std": 0.00439481483772397, + "rewards//mean": 0.76422119140625, + "rewards//std": 0.0166073739528656, + "step": 2988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5978, + "grad_norm": 5.577917575836182, + "kl": 1.2175132408738136, + "learning_rate": 3.551803419768251e-07, + "loss": 0.1218, + "num_tokens": 25832609.0, + "reward": 0.76849365234375, + "reward_std": 0.006278870161622763, + "rewards//mean": 0.76849365234375, + "rewards//std": 0.031127629801630974, + "step": 2989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.598, + "grad_norm": 2.201782464981079, + "kl": 1.2101694513112307, + "learning_rate": 3.5487664075922686e-07, + "loss": 0.121, + "num_tokens": 25841257.0, + "reward": 0.77191162109375, + "reward_std": 0.0070481267757713795, + "rewards//mean": 0.77191162109375, + "rewards//std": 0.021661117672920227, + "step": 2990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5982, + "grad_norm": 3.317814826965332, + "kl": 1.983419205993414, + "learning_rate": 3.5457299799730045e-07, + "loss": 0.1983, + "num_tokens": 25849849.0, + "reward": 0.775634765625, + "reward_std": 0.017710991203784943, + "rewards//mean": 0.775634765625, + "rewards//std": 0.035440150648355484, + "step": 2991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5984, + "grad_norm": 2.7815587520599365, + "kl": 1.920902220532298, + "learning_rate": 3.5426941381335296e-07, + "loss": 0.1921, + "num_tokens": 25858489.0, + "reward": 0.77154541015625, + "reward_std": 0.01617206260561943, + "rewards//mean": 0.77154541015625, + "rewards//std": 0.02475418895483017, + "step": 2992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5986, + "grad_norm": 3.550283193588257, + "kl": 2.604361003264785, + "learning_rate": 3.5396588832966824e-07, + "loss": 0.2604, + "num_tokens": 25867137.0, + "reward": 0.7188720703125, + "reward_std": 0.013169539161026478, + "rewards//mean": 0.7188720703125, + "rewards//std": 0.040193021297454834, + "step": 2993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5988, + "grad_norm": 4.1006269454956055, + "kl": 1.754389937967062, + "learning_rate": 3.536624216685062e-07, + "loss": 0.1754, + "num_tokens": 25875753.0, + "reward": 0.75006103515625, + "reward_std": 0.013646166771650314, + "rewards//mean": 0.75006103515625, + "rewards//std": 0.03968070074915886, + "step": 2994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.599, + "grad_norm": 3.319735527038574, + "kl": 1.5468129850924015, + "learning_rate": 3.5335901395210326e-07, + "loss": 0.1547, + "num_tokens": 25884441.0, + "reward": 0.78729248046875, + "reward_std": 0.019312655553221703, + "rewards//mean": 0.78729248046875, + "rewards//std": 0.02796063758432865, + "step": 2995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5992, + "grad_norm": 4.306685447692871, + "kl": 1.9277800042182207, + "learning_rate": 3.530556653026721e-07, + "loss": 0.1928, + "num_tokens": 25893105.0, + "reward": 0.69921875, + "reward_std": 0.01398519054055214, + "rewards//mean": 0.69921875, + "rewards//std": 0.04721992090344429, + "step": 2996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5994, + "grad_norm": 10.92469596862793, + "kl": 2.6271385326981544, + "learning_rate": 3.5275237584240123e-07, + "loss": 0.2627, + "num_tokens": 25901857.0, + "reward": 0.755615234375, + "reward_std": 0.010458522476255894, + "rewards//mean": 0.755615234375, + "rewards//std": 0.03322500362992287, + "step": 2997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5996, + "grad_norm": 2.0316758155822754, + "kl": 1.413595201447606, + "learning_rate": 3.5244914569345574e-07, + "loss": 0.1414, + "num_tokens": 25910489.0, + "reward": 0.77655029296875, + "reward_std": 0.012451614253222942, + "rewards//mean": 0.77655029296875, + "rewards//std": 0.03838815167546272, + "step": 2998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.5998, + "grad_norm": 7.328047275543213, + "kl": 2.106111042201519, + "learning_rate": 3.521459749779768e-07, + "loss": 0.2106, + "num_tokens": 25919177.0, + "reward": 0.7867431640625, + "reward_std": 0.009804660454392433, + "rewards//mean": 0.7867431640625, + "rewards//std": 0.03047080710530281, + "step": 2999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6, + "grad_norm": 1.698675513267517, + "kl": 1.7684502508491278, + "learning_rate": 3.518428638180813e-07, + "loss": 0.1768, + "num_tokens": 25927729.0, + "reward": 0.7545166015625, + "reward_std": 0.009782938286662102, + "rewards//mean": 0.7545166015625, + "rewards//std": 0.041481517255306244, + "step": 3000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6002, + "grad_norm": 3.516226053237915, + "kl": 1.4426108710467815, + "learning_rate": 3.5153981233586274e-07, + "loss": 0.1443, + "num_tokens": 25936321.0, + "reward": 0.77178955078125, + "reward_std": 0.011105319485068321, + "rewards//mean": 0.77178955078125, + "rewards//std": 0.02826112136244774, + "step": 3001 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6004, + "grad_norm": 1.5403438806533813, + "kl": 1.6524803712964058, + "learning_rate": 3.512368206533898e-07, + "loss": 0.1652, + "num_tokens": 25944945.0, + "reward": 0.75640869140625, + "reward_std": 0.008769109845161438, + "rewards//mean": 0.75640869140625, + "rewards//std": 0.02786899171769619, + "step": 3002 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6006, + "grad_norm": 4.581449031829834, + "kl": 2.0956821255385876, + "learning_rate": 3.509338888927079e-07, + "loss": 0.2096, + "num_tokens": 25953569.0, + "reward": 0.76666259765625, + "reward_std": 0.011500043794512749, + "rewards//mean": 0.76666259765625, + "rewards//std": 0.03810357302427292, + "step": 3003 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6008, + "grad_norm": 3.164768695831299, + "kl": 2.080216048285365, + "learning_rate": 3.506310171758375e-07, + "loss": 0.208, + "num_tokens": 25962201.0, + "reward": 0.771728515625, + "reward_std": 0.020446758717298508, + "rewards//mean": 0.771728515625, + "rewards//std": 0.034694213420152664, + "step": 3004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.601, + "grad_norm": 1.5213747024536133, + "kl": 1.834243943914771, + "learning_rate": 3.503282056247757e-07, + "loss": 0.1834, + "num_tokens": 25970849.0, + "reward": 0.7191162109375, + "reward_std": 0.013543836772441864, + "rewards//mean": 0.7191162109375, + "rewards//std": 0.040837161242961884, + "step": 3005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6012, + "grad_norm": 2.71396803855896, + "kl": 1.4250467289239168, + "learning_rate": 3.500254543614947e-07, + "loss": 0.1425, + "num_tokens": 25979465.0, + "reward": 0.75128173828125, + "reward_std": 0.008379924111068249, + "rewards//mean": 0.75128173828125, + "rewards//std": 0.028915589675307274, + "step": 3006 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6014, + "grad_norm": 2.806899309158325, + "kl": 1.9753188397735357, + "learning_rate": 3.4972276350794284e-07, + "loss": 0.1975, + "num_tokens": 25988089.0, + "reward": 0.75347900390625, + "reward_std": 0.017587844282388687, + "rewards//mean": 0.75347900390625, + "rewards//std": 0.03568018972873688, + "step": 3007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6016, + "grad_norm": 2.5037572383880615, + "kl": 1.7887899596244097, + "learning_rate": 3.494201331860438e-07, + "loss": 0.1789, + "num_tokens": 25996657.0, + "reward": 0.749755859375, + "reward_std": 0.01157643087208271, + "rewards//mean": 0.749755859375, + "rewards//std": 0.03492382913827896, + "step": 3008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6018, + "grad_norm": 1.4507462978363037, + "kl": 1.1158835459500551, + "learning_rate": 3.4911756351769716e-07, + "loss": 0.1116, + "num_tokens": 26005305.0, + "reward": 0.76885986328125, + "reward_std": 0.006346943322569132, + "rewards//mean": 0.76885986328125, + "rewards//std": 0.029826803132891655, + "step": 3009 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.602, + "grad_norm": 2.2666609287261963, + "kl": 1.5605859085917473, + "learning_rate": 3.488150546247778e-07, + "loss": 0.1561, + "num_tokens": 26013993.0, + "reward": 0.7403564453125, + "reward_std": 0.011423186399042606, + "rewards//mean": 0.7403564453125, + "rewards//std": 0.03735717758536339, + "step": 3010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6022, + "grad_norm": 3.2989916801452637, + "kl": 0.9814637470990419, + "learning_rate": 3.485126066291364e-07, + "loss": 0.0981, + "num_tokens": 26022545.0, + "reward": 0.75433349609375, + "reward_std": 0.0057829637080430984, + "rewards//mean": 0.75433349609375, + "rewards//std": 0.04057773947715759, + "step": 3011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6024, + "grad_norm": 1.7241014242172241, + "kl": 1.1938377879559994, + "learning_rate": 3.48210219652599e-07, + "loss": 0.1194, + "num_tokens": 26031281.0, + "reward": 0.733154296875, + "reward_std": 0.0060042389668524265, + "rewards//mean": 0.733154296875, + "rewards//std": 0.028321649879217148, + "step": 3012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6026, + "grad_norm": 2.987412691116333, + "kl": 1.7079011872410774, + "learning_rate": 3.4790789381696685e-07, + "loss": 0.1708, + "num_tokens": 26039937.0, + "reward": 0.71392822265625, + "reward_std": 0.007207636721432209, + "rewards//mean": 0.71392822265625, + "rewards//std": 0.031441181898117065, + "step": 3013 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6028, + "grad_norm": 4.491795063018799, + "kl": 1.636379435658455, + "learning_rate": 3.4760562924401706e-07, + "loss": 0.1636, + "num_tokens": 26048673.0, + "reward": 0.76654052734375, + "reward_std": 0.013729985803365707, + "rewards//mean": 0.76654052734375, + "rewards//std": 0.03246448189020157, + "step": 3014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.603, + "grad_norm": 3.7369182109832764, + "kl": 1.7946348879486322, + "learning_rate": 3.4730342605550134e-07, + "loss": 0.1795, + "num_tokens": 26057337.0, + "reward": 0.76202392578125, + "reward_std": 0.01479298621416092, + "rewards//mean": 0.76202392578125, + "rewards//std": 0.03637389838695526, + "step": 3015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6032, + "grad_norm": 2.633269786834717, + "kl": 1.3660208452492952, + "learning_rate": 3.470012843731476e-07, + "loss": 0.1366, + "num_tokens": 26066033.0, + "reward": 0.7716064453125, + "reward_std": 0.011088110506534576, + "rewards//mean": 0.7716064453125, + "rewards//std": 0.029322441667318344, + "step": 3016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6034, + "grad_norm": 1.3917341232299805, + "kl": 0.7705397736281157, + "learning_rate": 3.4669920431865795e-07, + "loss": 0.0771, + "num_tokens": 26074737.0, + "reward": 0.77545166015625, + "reward_std": 0.001434464706107974, + "rewards//mean": 0.77545166015625, + "rewards//std": 0.01984485611319542, + "step": 3017 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6036, + "grad_norm": 1.7201029062271118, + "kl": 0.7895236238837242, + "learning_rate": 3.463971860137107e-07, + "loss": 0.079, + "num_tokens": 26083305.0, + "reward": 0.73858642578125, + "reward_std": 0.004490915685892105, + "rewards//mean": 0.73858642578125, + "rewards//std": 0.0251840241253376, + "step": 3018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6038, + "grad_norm": 0.7892788648605347, + "kl": 0.8609472755342722, + "learning_rate": 3.460952295799584e-07, + "loss": 0.0861, + "num_tokens": 26091913.0, + "reward": 0.77264404296875, + "reward_std": 0.004325044807046652, + "rewards//mean": 0.77264404296875, + "rewards//std": 0.0282557625323534, + "step": 3019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.604, + "grad_norm": 4.398412704467773, + "kl": 1.5428178068250418, + "learning_rate": 3.457933351390293e-07, + "loss": 0.1543, + "num_tokens": 26100489.0, + "reward": 0.75726318359375, + "reward_std": 0.011328289285302162, + "rewards//mean": 0.75726318359375, + "rewards//std": 0.042042043060064316, + "step": 3020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6042, + "grad_norm": 1.3493902683258057, + "kl": 0.8882236573845148, + "learning_rate": 3.454915028125263e-07, + "loss": 0.0888, + "num_tokens": 26109129.0, + "reward": 0.7733154296875, + "reward_std": 0.00365462526679039, + "rewards//mean": 0.7733154296875, + "rewards//std": 0.03083624318242073, + "step": 3021 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6044, + "grad_norm": 5.798737049102783, + "kl": 1.5297288578003645, + "learning_rate": 3.451897327220276e-07, + "loss": 0.153, + "num_tokens": 26117889.0, + "reward": 0.7371826171875, + "reward_std": 0.004250944592058659, + "rewards//mean": 0.7371826171875, + "rewards//std": 0.044004522264003754, + "step": 3022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6046, + "grad_norm": 1.6715644598007202, + "kl": 1.9425526298582554, + "learning_rate": 3.448880249890859e-07, + "loss": 0.1943, + "num_tokens": 26126529.0, + "reward": 0.749267578125, + "reward_std": 0.012378041632473469, + "rewards//mean": 0.749267578125, + "rewards//std": 0.033261433243751526, + "step": 3023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6048, + "grad_norm": 3.531263828277588, + "kl": 2.101448990404606, + "learning_rate": 3.445863797352293e-07, + "loss": 0.2101, + "num_tokens": 26135217.0, + "reward": 0.72637939453125, + "reward_std": 0.007120020221918821, + "rewards//mean": 0.72637939453125, + "rewards//std": 0.034445878118276596, + "step": 3024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.605, + "grad_norm": 2.046232223510742, + "kl": 1.192565981298685, + "learning_rate": 3.4428479708196033e-07, + "loss": 0.1193, + "num_tokens": 26143825.0, + "reward": 0.7681884765625, + "reward_std": 0.009021867997944355, + "rewards//mean": 0.7681884765625, + "rewards//std": 0.027737028896808624, + "step": 3025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6052, + "grad_norm": 4.147281646728516, + "kl": 1.768487649038434, + "learning_rate": 3.439832771507565e-07, + "loss": 0.1768, + "num_tokens": 26152433.0, + "reward": 0.7225341796875, + "reward_std": 0.009325562976300716, + "rewards//mean": 0.7225341796875, + "rewards//std": 0.03043302893638611, + "step": 3026 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6054, + "grad_norm": 1.0630336999893188, + "kl": 0.956812996417284, + "learning_rate": 3.4368182006307e-07, + "loss": 0.0957, + "num_tokens": 26161097.0, + "reward": 0.771728515625, + "reward_std": 0.004608482588082552, + "rewards//mean": 0.771728515625, + "rewards//std": 0.032067637890577316, + "step": 3027 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6056, + "grad_norm": 2.7875843048095703, + "kl": 1.260486289858818, + "learning_rate": 3.433804259403276e-07, + "loss": 0.126, + "num_tokens": 26169753.0, + "reward": 0.77166748046875, + "reward_std": 0.012122110463678837, + "rewards//mean": 0.77166748046875, + "rewards//std": 0.028253620490431786, + "step": 3028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6058, + "grad_norm": 6.932296276092529, + "kl": 1.663364239037037, + "learning_rate": 3.430790949039309e-07, + "loss": 0.1663, + "num_tokens": 26178433.0, + "reward": 0.779541015625, + "reward_std": 0.008525843732059002, + "rewards//mean": 0.779541015625, + "rewards//std": 0.027760213240981102, + "step": 3029 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.606, + "grad_norm": 2.210796356201172, + "kl": 0.8479131422936916, + "learning_rate": 3.4277782707525603e-07, + "loss": 0.0848, + "num_tokens": 26187057.0, + "reward": 0.76690673828125, + "reward_std": 0.0033338924404233694, + "rewards//mean": 0.76690673828125, + "rewards//std": 0.01960001140832901, + "step": 3030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6062, + "grad_norm": 1.2629185914993286, + "kl": 0.5924016125500202, + "learning_rate": 3.4247662257565366e-07, + "loss": 0.0592, + "num_tokens": 26195705.0, + "reward": 0.756591796875, + "reward_std": 0.0019523652736097574, + "rewards//mean": 0.756591796875, + "rewards//std": 0.022128576412796974, + "step": 3031 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6064, + "grad_norm": 2.329953670501709, + "kl": 1.1548113971948624, + "learning_rate": 3.421754815264488e-07, + "loss": 0.1155, + "num_tokens": 26204385.0, + "reward": 0.7384033203125, + "reward_std": 0.00849184300750494, + "rewards//mean": 0.7384033203125, + "rewards//std": 0.031218741089105606, + "step": 3032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6066, + "grad_norm": 1.1048036813735962, + "kl": 1.5606459695845842, + "learning_rate": 3.418744040489412e-07, + "loss": 0.1561, + "num_tokens": 26213065.0, + "reward": 0.767333984375, + "reward_std": 0.010069664567708969, + "rewards//mean": 0.767333984375, + "rewards//std": 0.036210861057043076, + "step": 3033 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6068, + "grad_norm": 4.638854026794434, + "kl": 1.4733854681253433, + "learning_rate": 3.415733902644046e-07, + "loss": 0.1473, + "num_tokens": 26221649.0, + "reward": 0.7706298828125, + "reward_std": 0.012891530990600586, + "rewards//mean": 0.7706298828125, + "rewards//std": 0.032633595168590546, + "step": 3034 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.607, + "grad_norm": 0.7102227807044983, + "kl": 0.6467064693570137, + "learning_rate": 3.4127244029408756e-07, + "loss": 0.0647, + "num_tokens": 26230313.0, + "reward": 0.79351806640625, + "reward_std": 0.0013707405887544155, + "rewards//mean": 0.79351806640625, + "rewards//std": 0.019548196345567703, + "step": 3035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6072, + "grad_norm": 1.6537208557128906, + "kl": 1.24411166831851, + "learning_rate": 3.4097155425921256e-07, + "loss": 0.1244, + "num_tokens": 26239033.0, + "reward": 0.73028564453125, + "reward_std": 0.010103070177137852, + "rewards//mean": 0.73028564453125, + "rewards//std": 0.039435409009456635, + "step": 3036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6074, + "grad_norm": 1.9461979866027832, + "kl": 1.70147574134171, + "learning_rate": 3.4067073228097655e-07, + "loss": 0.1701, + "num_tokens": 26247689.0, + "reward": 0.741943359375, + "reward_std": 0.006666326895356178, + "rewards//mean": 0.741943359375, + "rewards//std": 0.025984639301896095, + "step": 3037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6076, + "grad_norm": 6.178399085998535, + "kl": 1.6182691976428032, + "learning_rate": 3.4036997448055036e-07, + "loss": 0.1618, + "num_tokens": 26256337.0, + "reward": 0.7581787109375, + "reward_std": 0.008596446365118027, + "rewards//mean": 0.7581787109375, + "rewards//std": 0.038201820105314255, + "step": 3038 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6078, + "grad_norm": 9.092072486877441, + "kl": 1.380062386393547, + "learning_rate": 3.4006928097907954e-07, + "loss": 0.138, + "num_tokens": 26265009.0, + "reward": 0.7705078125, + "reward_std": 0.008681556209921837, + "rewards//mean": 0.7705078125, + "rewards//std": 0.02412998303771019, + "step": 3039 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.608, + "grad_norm": 0.8963751196861267, + "kl": 1.091773722320795, + "learning_rate": 3.397686518976831e-07, + "loss": 0.1092, + "num_tokens": 26273649.0, + "reward": 0.7603759765625, + "reward_std": 0.005628220271319151, + "rewards//mean": 0.7603759765625, + "rewards//std": 0.017747391015291214, + "step": 3040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6082, + "grad_norm": 1.6986956596374512, + "kl": 1.5452642608433962, + "learning_rate": 3.394680873574546e-07, + "loss": 0.1545, + "num_tokens": 26282329.0, + "reward": 0.73553466796875, + "reward_std": 0.009731960482895374, + "rewards//mean": 0.73553466796875, + "rewards//std": 0.032575733959674835, + "step": 3041 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6084, + "grad_norm": 4.254377841949463, + "kl": 0.8951471261680126, + "learning_rate": 3.391675874794612e-07, + "loss": 0.0895, + "num_tokens": 26291057.0, + "reward": 0.72967529296875, + "reward_std": 0.003615229856222868, + "rewards//mean": 0.72967529296875, + "rewards//std": 0.019227294251322746, + "step": 3042 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6086, + "grad_norm": 1.8046212196350098, + "kl": 0.7944396063685417, + "learning_rate": 3.388671523847445e-07, + "loss": 0.0794, + "num_tokens": 26299681.0, + "reward": 0.77142333984375, + "reward_std": 0.0031921416521072388, + "rewards//mean": 0.77142333984375, + "rewards//std": 0.024119865149259567, + "step": 3043 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 127.59375, + "epoch": 0.6088, + "grad_norm": 7.896710395812988, + "kl": 1.784462120383978, + "learning_rate": 3.3856678219431944e-07, + "loss": 0.1714, + "num_tokens": 26308335.0, + "reward": 0.7510986328125, + "reward_std": 0.009847967885434628, + "rewards//mean": 0.7510986328125, + "rewards//std": 0.03741871938109398, + "step": 3044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.609, + "grad_norm": 4.971747875213623, + "kl": 0.8912597857415676, + "learning_rate": 3.382664770291752e-07, + "loss": 0.0891, + "num_tokens": 26316895.0, + "reward": 0.76812744140625, + "reward_std": 0.007516819983720779, + "rewards//mean": 0.76812744140625, + "rewards//std": 0.025698702782392502, + "step": 3045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6092, + "grad_norm": 2.7133586406707764, + "kl": 2.0692453887313604, + "learning_rate": 3.3796623701027473e-07, + "loss": 0.2069, + "num_tokens": 26325503.0, + "reward": 0.7222900390625, + "reward_std": 0.010012170299887657, + "rewards//mean": 0.7222900390625, + "rewards//std": 0.026481760665774345, + "step": 3046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6094, + "grad_norm": 1.0347590446472168, + "kl": 0.6974033433943987, + "learning_rate": 3.376660622585545e-07, + "loss": 0.0697, + "num_tokens": 26334143.0, + "reward": 0.80157470703125, + "reward_std": 0.0033798220101743937, + "rewards//mean": 0.80157470703125, + "rewards//std": 0.0249412190169096, + "step": 3047 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6096, + "grad_norm": 0.6898975372314453, + "kl": 0.7912088725715876, + "learning_rate": 3.373659528949251e-07, + "loss": 0.0791, + "num_tokens": 26342735.0, + "reward": 0.75244140625, + "reward_std": 0.004709369502961636, + "rewards//mean": 0.75244140625, + "rewards//std": 0.02856539748609066, + "step": 3048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6098, + "grad_norm": 3.180156707763672, + "kl": 0.9306967034935951, + "learning_rate": 3.370659090402703e-07, + "loss": 0.0931, + "num_tokens": 26351399.0, + "reward": 0.7689208984375, + "reward_std": 0.009557828307151794, + "rewards//mean": 0.7689208984375, + "rewards//std": 0.02485404722392559, + "step": 3049 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.61, + "grad_norm": 4.688920021057129, + "kl": 3.0295713804662228, + "learning_rate": 3.36765930815448e-07, + "loss": 0.303, + "num_tokens": 26359943.0, + "reward": 0.71453857421875, + "reward_std": 0.015398615971207619, + "rewards//mean": 0.71453857421875, + "rewards//std": 0.040327392518520355, + "step": 3050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6102, + "grad_norm": 1.5733107328414917, + "kl": 1.0377127453684807, + "learning_rate": 3.3646601834128916e-07, + "loss": 0.1038, + "num_tokens": 26368607.0, + "reward": 0.73211669921875, + "reward_std": 0.0034231869503855705, + "rewards//mean": 0.73211669921875, + "rewards//std": 0.03208975866436958, + "step": 3051 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6104, + "grad_norm": 3.0359301567077637, + "kl": 1.3004213366657495, + "learning_rate": 3.361661717385986e-07, + "loss": 0.13, + "num_tokens": 26377287.0, + "reward": 0.77276611328125, + "reward_std": 0.008777014911174774, + "rewards//mean": 0.77276611328125, + "rewards//std": 0.025691043585538864, + "step": 3052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6106, + "grad_norm": 1.7775403261184692, + "kl": 1.2840811777859926, + "learning_rate": 3.358663911281544e-07, + "loss": 0.1284, + "num_tokens": 26385847.0, + "reward": 0.73748779296875, + "reward_std": 0.004279830493032932, + "rewards//mean": 0.73748779296875, + "rewards//std": 0.03302249312400818, + "step": 3053 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6108, + "grad_norm": 2.383028268814087, + "kl": 1.0032009296119213, + "learning_rate": 3.3556667663070835e-07, + "loss": 0.1003, + "num_tokens": 26394471.0, + "reward": 0.74566650390625, + "reward_std": 0.0037385423202067614, + "rewards//mean": 0.74566650390625, + "rewards//std": 0.012256534770131111, + "step": 3054 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.611, + "grad_norm": 2.8626515865325928, + "kl": 1.6999119650572538, + "learning_rate": 3.3526702836698515e-07, + "loss": 0.17, + "num_tokens": 26403023.0, + "reward": 0.74169921875, + "reward_std": 0.01018468290567398, + "rewards//mean": 0.74169921875, + "rewards//std": 0.025548720732331276, + "step": 3055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6112, + "grad_norm": 2.1783156394958496, + "kl": 0.8691457733511925, + "learning_rate": 3.349674464576834e-07, + "loss": 0.0869, + "num_tokens": 26411583.0, + "reward": 0.76385498046875, + "reward_std": 0.004479769617319107, + "rewards//mean": 0.76385498046875, + "rewards//std": 0.023724369704723358, + "step": 3056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6114, + "grad_norm": 3.5689773559570312, + "kl": 1.7653647121042013, + "learning_rate": 3.3466793102347433e-07, + "loss": 0.1765, + "num_tokens": 26420207.0, + "reward": 0.73162841796875, + "reward_std": 0.014874370768666267, + "rewards//mean": 0.73162841796875, + "rewards//std": 0.04025149717926979, + "step": 3057 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6116, + "grad_norm": 4.278613090515137, + "kl": 1.2750835418701172, + "learning_rate": 3.34368482185003e-07, + "loss": 0.1275, + "num_tokens": 26428839.0, + "reward": 0.7366943359375, + "reward_std": 0.0087828878313303, + "rewards//mean": 0.7366943359375, + "rewards//std": 0.025370048359036446, + "step": 3058 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6118, + "grad_norm": 2.8363375663757324, + "kl": 1.9835983160883188, + "learning_rate": 3.3406910006288716e-07, + "loss": 0.1984, + "num_tokens": 26437415.0, + "reward": 0.73077392578125, + "reward_std": 0.01027078740298748, + "rewards//mean": 0.73077392578125, + "rewards//std": 0.03327184170484543, + "step": 3059 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.612, + "grad_norm": 1.3819901943206787, + "kl": 1.2905705105513334, + "learning_rate": 3.337697847777179e-07, + "loss": 0.1291, + "num_tokens": 26446031.0, + "reward": 0.77630615234375, + "reward_std": 0.009390534833073616, + "rewards//mean": 0.77630615234375, + "rewards//std": 0.025960085913538933, + "step": 3060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6122, + "grad_norm": 3.1859402656555176, + "kl": 1.5130952596664429, + "learning_rate": 3.3347053645005965e-07, + "loss": 0.1513, + "num_tokens": 26454599.0, + "reward": 0.7662353515625, + "reward_std": 0.006422541104257107, + "rewards//mean": 0.7662353515625, + "rewards//std": 0.023829180747270584, + "step": 3061 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 126.828125, + "epoch": 0.6124, + "grad_norm": 10.052289009094238, + "kl": 2.5300799626857042, + "learning_rate": 3.331713552004492e-07, + "loss": 0.2463, + "num_tokens": 26463308.0, + "reward": 0.7520751953125, + "reward_std": 0.012518838047981262, + "rewards//mean": 0.7520751953125, + "rewards//std": 0.03320381045341492, + "step": 3062 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6126, + "grad_norm": 4.029149532318115, + "kl": 1.5815801564604044, + "learning_rate": 3.3287224114939704e-07, + "loss": 0.1582, + "num_tokens": 26471916.0, + "reward": 0.76641845703125, + "reward_std": 0.01160445250570774, + "rewards//mean": 0.76641845703125, + "rewards//std": 0.03243042528629303, + "step": 3063 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6128, + "grad_norm": 1.440954327583313, + "kl": 0.8544907737523317, + "learning_rate": 3.325731944173861e-07, + "loss": 0.0854, + "num_tokens": 26480508.0, + "reward": 0.76568603515625, + "reward_std": 0.006779256742447615, + "rewards//mean": 0.76568603515625, + "rewards//std": 0.03256690502166748, + "step": 3064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.613, + "grad_norm": 2.6406941413879395, + "kl": 1.875879967585206, + "learning_rate": 3.3227421512487255e-07, + "loss": 0.1876, + "num_tokens": 26489100.0, + "reward": 0.75494384765625, + "reward_std": 0.009458349086344242, + "rewards//mean": 0.75494384765625, + "rewards//std": 0.03300094231963158, + "step": 3065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6132, + "grad_norm": 3.960733652114868, + "kl": 1.5292521957308054, + "learning_rate": 3.319753033922849e-07, + "loss": 0.1529, + "num_tokens": 26497772.0, + "reward": 0.733154296875, + "reward_std": 0.008773379027843475, + "rewards//mean": 0.733154296875, + "rewards//std": 0.03186305984854698, + "step": 3066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6134, + "grad_norm": 2.836421012878418, + "kl": 0.7934996373951435, + "learning_rate": 3.316764593400251e-07, + "loss": 0.0793, + "num_tokens": 26506420.0, + "reward": 0.76751708984375, + "reward_std": 0.0041528730653226376, + "rewards//mean": 0.76751708984375, + "rewards//std": 0.024149971082806587, + "step": 3067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6136, + "grad_norm": 2.7055716514587402, + "kl": 1.3978541549295187, + "learning_rate": 3.313776830884672e-07, + "loss": 0.1398, + "num_tokens": 26515060.0, + "reward": 0.7655029296875, + "reward_std": 0.011144038289785385, + "rewards//mean": 0.7655029296875, + "rewards//std": 0.03049663081765175, + "step": 3068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6138, + "grad_norm": 1.3312785625457764, + "kl": 1.3039655592292547, + "learning_rate": 3.3107897475795855e-07, + "loss": 0.1304, + "num_tokens": 26523868.0, + "reward": 0.75, + "reward_std": 0.006555340252816677, + "rewards//mean": 0.75, + "rewards//std": 0.030193578451871872, + "step": 3069 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.614, + "grad_norm": 1.3878076076507568, + "kl": 1.0364035405218601, + "learning_rate": 3.307803344688185e-07, + "loss": 0.1036, + "num_tokens": 26532460.0, + "reward": 0.74017333984375, + "reward_std": 0.0026006854604929686, + "rewards//mean": 0.74017333984375, + "rewards//std": 0.02354053407907486, + "step": 3070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6142, + "grad_norm": 1.3807885646820068, + "kl": 1.0803507324308157, + "learning_rate": 3.3048176234133963e-07, + "loss": 0.108, + "num_tokens": 26541052.0, + "reward": 0.72808837890625, + "reward_std": 0.0037320968694984913, + "rewards//mean": 0.72808837890625, + "rewards//std": 0.03054145723581314, + "step": 3071 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6144, + "grad_norm": 1.6576881408691406, + "kl": 1.7159061916172504, + "learning_rate": 3.3018325849578656e-07, + "loss": 0.1716, + "num_tokens": 26549644.0, + "reward": 0.77374267578125, + "reward_std": 0.012323146685957909, + "rewards//mean": 0.77374267578125, + "rewards//std": 0.03296468406915665, + "step": 3072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6146, + "grad_norm": 1.1298251152038574, + "kl": 0.9207482524216175, + "learning_rate": 3.298848230523967e-07, + "loss": 0.0921, + "num_tokens": 26558252.0, + "reward": 0.77618408203125, + "reward_std": 0.006822366267442703, + "rewards//mean": 0.77618408203125, + "rewards//std": 0.02182888798415661, + "step": 3073 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6148, + "grad_norm": 1.851266860961914, + "kl": 1.0993687957525253, + "learning_rate": 3.295864561313797e-07, + "loss": 0.1099, + "num_tokens": 26566892.0, + "reward": 0.74017333984375, + "reward_std": 0.005774942226707935, + "rewards//mean": 0.74017333984375, + "rewards//std": 0.027081118896603584, + "step": 3074 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.615, + "grad_norm": 1.9921793937683105, + "kl": 1.063608292490244, + "learning_rate": 3.2928815785291786e-07, + "loss": 0.1064, + "num_tokens": 26575492.0, + "reward": 0.7684326171875, + "reward_std": 0.004690760280936956, + "rewards//mean": 0.7684326171875, + "rewards//std": 0.024513069540262222, + "step": 3075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6152, + "grad_norm": 1.6634228229522705, + "kl": 0.8500278852880001, + "learning_rate": 3.2898992833716563e-07, + "loss": 0.085, + "num_tokens": 26584188.0, + "reward": 0.73504638671875, + "reward_std": 0.004039994906634092, + "rewards//mean": 0.73504638671875, + "rewards//std": 0.032973866909742355, + "step": 3076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6154, + "grad_norm": 4.993829727172852, + "kl": 1.6191273536533117, + "learning_rate": 3.2869176770424973e-07, + "loss": 0.1619, + "num_tokens": 26592788.0, + "reward": 0.74664306640625, + "reward_std": 0.007761973887681961, + "rewards//mean": 0.74664306640625, + "rewards//std": 0.035467423498630524, + "step": 3077 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6156, + "grad_norm": 2.3756072521209717, + "kl": 1.0216530002653599, + "learning_rate": 3.2839367607426937e-07, + "loss": 0.1022, + "num_tokens": 26601436.0, + "reward": 0.77679443359375, + "reward_std": 0.009065305814146996, + "rewards//mean": 0.77679443359375, + "rewards//std": 0.023580368608236313, + "step": 3078 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6158, + "grad_norm": 2.9174551963806152, + "kl": 1.5090681836009026, + "learning_rate": 3.2809565356729575e-07, + "loss": 0.1509, + "num_tokens": 26610156.0, + "reward": 0.807861328125, + "reward_std": 0.007382662035524845, + "rewards//mean": 0.807861328125, + "rewards//std": 0.018280580639839172, + "step": 3079 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.616, + "grad_norm": 3.698621988296509, + "kl": 1.500789461657405, + "learning_rate": 3.2779770030337235e-07, + "loss": 0.1501, + "num_tokens": 26618708.0, + "reward": 0.7303466796875, + "reward_std": 0.004728636704385281, + "rewards//mean": 0.7303466796875, + "rewards//std": 0.032947514206171036, + "step": 3080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6162, + "grad_norm": 1.6589545011520386, + "kl": 1.7610855177044868, + "learning_rate": 3.274998164025148e-07, + "loss": 0.1761, + "num_tokens": 26627380.0, + "reward": 0.7755126953125, + "reward_std": 0.011748873628675938, + "rewards//mean": 0.7755126953125, + "rewards//std": 0.02747161127626896, + "step": 3081 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6164, + "grad_norm": 5.202383518218994, + "kl": 2.484533565118909, + "learning_rate": 3.272020019847104e-07, + "loss": 0.2485, + "num_tokens": 26636020.0, + "reward": 0.75439453125, + "reward_std": 0.009634185582399368, + "rewards//mean": 0.75439453125, + "rewards//std": 0.02290969155728817, + "step": 3082 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6166, + "grad_norm": 5.143094062805176, + "kl": 1.4807845540344715, + "learning_rate": 3.2690425716991897e-07, + "loss": 0.1481, + "num_tokens": 26644740.0, + "reward": 0.7884521484375, + "reward_std": 0.004552271217107773, + "rewards//mean": 0.7884521484375, + "rewards//std": 0.01522011961787939, + "step": 3083 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6168, + "grad_norm": 1.7713041305541992, + "kl": 0.8311720471829176, + "learning_rate": 3.26606582078072e-07, + "loss": 0.0831, + "num_tokens": 26653348.0, + "reward": 0.74725341796875, + "reward_std": 0.00626820232719183, + "rewards//mean": 0.74725341796875, + "rewards//std": 0.026463106274604797, + "step": 3084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.617, + "grad_norm": 2.7567317485809326, + "kl": 1.6191201265901327, + "learning_rate": 3.263089768290731e-07, + "loss": 0.1619, + "num_tokens": 26661900.0, + "reward": 0.75640869140625, + "reward_std": 0.005338138435035944, + "rewards//mean": 0.75640869140625, + "rewards//std": 0.021274706348776817, + "step": 3085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6172, + "grad_norm": 1.5050052404403687, + "kl": 0.9748966041952372, + "learning_rate": 3.260114415427975e-07, + "loss": 0.0975, + "num_tokens": 26670588.0, + "reward": 0.76531982421875, + "reward_std": 0.004789609927684069, + "rewards//mean": 0.76531982421875, + "rewards//std": 0.026459673419594765, + "step": 3086 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6174, + "grad_norm": 1.7413899898529053, + "kl": 1.839627893641591, + "learning_rate": 3.257139763390925e-07, + "loss": 0.184, + "num_tokens": 26679300.0, + "reward": 0.76226806640625, + "reward_std": 0.008983292616903782, + "rewards//mean": 0.76226806640625, + "rewards//std": 0.03491684049367905, + "step": 3087 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6176, + "grad_norm": 1.2620952129364014, + "kl": 1.3463593106716871, + "learning_rate": 3.254165813377769e-07, + "loss": 0.1346, + "num_tokens": 26687860.0, + "reward": 0.74835205078125, + "reward_std": 0.011344533413648605, + "rewards//mean": 0.74835205078125, + "rewards//std": 0.02648254670202732, + "step": 3088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6178, + "grad_norm": 3.150270462036133, + "kl": 1.2956474348902702, + "learning_rate": 3.251192566586416e-07, + "loss": 0.1296, + "num_tokens": 26696484.0, + "reward": 0.7996826171875, + "reward_std": 0.01063943188637495, + "rewards//mean": 0.7996826171875, + "rewards//std": 0.025031263008713722, + "step": 3089 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.618, + "grad_norm": 1.3744057416915894, + "kl": 1.3574109748005867, + "learning_rate": 3.2482200242144874e-07, + "loss": 0.1357, + "num_tokens": 26705132.0, + "reward": 0.793212890625, + "reward_std": 0.008448818698525429, + "rewards//mean": 0.793212890625, + "rewards//std": 0.02821025624871254, + "step": 3090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6182, + "grad_norm": 2.140544891357422, + "kl": 2.0010859295725822, + "learning_rate": 3.245248187459323e-07, + "loss": 0.2001, + "num_tokens": 26713892.0, + "reward": 0.7421875, + "reward_std": 0.011062685400247574, + "rewards//mean": 0.7421875, + "rewards//std": 0.029454562813043594, + "step": 3091 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6184, + "grad_norm": 2.2344956398010254, + "kl": 1.8325724210590124, + "learning_rate": 3.2422770575179793e-07, + "loss": 0.1833, + "num_tokens": 26722476.0, + "reward": 0.75244140625, + "reward_std": 0.014783394522964954, + "rewards//mean": 0.75244140625, + "rewards//std": 0.03212517872452736, + "step": 3092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6186, + "grad_norm": 1.1132185459136963, + "kl": 0.6581988055258989, + "learning_rate": 3.239306635587226e-07, + "loss": 0.0658, + "num_tokens": 26731084.0, + "reward": 0.75048828125, + "reward_std": 0.003540217876434326, + "rewards//mean": 0.75048828125, + "rewards//std": 0.018590956926345825, + "step": 3093 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6188, + "grad_norm": 1.1793066263198853, + "kl": 0.8928237054497004, + "learning_rate": 3.2363369228635504e-07, + "loss": 0.0893, + "num_tokens": 26739628.0, + "reward": 0.751220703125, + "reward_std": 0.0052474383264780045, + "rewards//mean": 0.751220703125, + "rewards//std": 0.02073865942656994, + "step": 3094 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.619, + "grad_norm": 11.197553634643555, + "kl": 3.683646809309721, + "learning_rate": 3.233367920543151e-07, + "loss": 0.3684, + "num_tokens": 26748420.0, + "reward": 0.75091552734375, + "reward_std": 0.026995355263352394, + "rewards//mean": 0.75091552734375, + "rewards//std": 0.04979507625102997, + "step": 3095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6192, + "grad_norm": 0.6987686157226562, + "kl": 0.702866168692708, + "learning_rate": 3.2303996298219413e-07, + "loss": 0.0703, + "num_tokens": 26757028.0, + "reward": 0.76788330078125, + "reward_std": 0.0027312629390507936, + "rewards//mean": 0.76788330078125, + "rewards//std": 0.018776394426822662, + "step": 3096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6194, + "grad_norm": 1.8362094163894653, + "kl": 1.7290078289806843, + "learning_rate": 3.2274320518955493e-07, + "loss": 0.1729, + "num_tokens": 26765628.0, + "reward": 0.7501220703125, + "reward_std": 0.009627903811633587, + "rewards//mean": 0.7501220703125, + "rewards//std": 0.023083142936229706, + "step": 3097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6196, + "grad_norm": 9.476593971252441, + "kl": 2.7877780478447676, + "learning_rate": 3.2244651879593156e-07, + "loss": 0.2788, + "num_tokens": 26774292.0, + "reward": 0.771240234375, + "reward_std": 0.009171138517558575, + "rewards//mean": 0.771240234375, + "rewards//std": 0.030096158385276794, + "step": 3098 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6198, + "grad_norm": 1.639664888381958, + "kl": 0.5748798735439777, + "learning_rate": 3.221499039208291e-07, + "loss": 0.0575, + "num_tokens": 26782900.0, + "reward": 0.7384033203125, + "reward_std": 0.0022318409755825996, + "rewards//mean": 0.7384033203125, + "rewards//std": 0.03793619945645332, + "step": 3099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.62, + "grad_norm": 1.357810616493225, + "kl": 0.9070506114512682, + "learning_rate": 3.2185336068372415e-07, + "loss": 0.0907, + "num_tokens": 26791612.0, + "reward": 0.74139404296875, + "reward_std": 0.005463881883770227, + "rewards//mean": 0.74139404296875, + "rewards//std": 0.034626465290784836, + "step": 3100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6202, + "grad_norm": 4.163914203643799, + "kl": 0.8537359219044447, + "learning_rate": 3.215568892040641e-07, + "loss": 0.0854, + "num_tokens": 26800340.0, + "reward": 0.75213623046875, + "reward_std": 0.002470265608280897, + "rewards//mean": 0.75213623046875, + "rewards//std": 0.030662648379802704, + "step": 3101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6204, + "grad_norm": 2.2612788677215576, + "kl": 1.0319197680801153, + "learning_rate": 3.2126048960126785e-07, + "loss": 0.1032, + "num_tokens": 26808876.0, + "reward": 0.74176025390625, + "reward_std": 0.003961701411753893, + "rewards//mean": 0.74176025390625, + "rewards//std": 0.022842111065983772, + "step": 3102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6206, + "grad_norm": 1.6722384691238403, + "kl": 0.8546520341187716, + "learning_rate": 3.2096416199472494e-07, + "loss": 0.0855, + "num_tokens": 26817516.0, + "reward": 0.73297119140625, + "reward_std": 0.002677050419151783, + "rewards//mean": 0.73297119140625, + "rewards//std": 0.020650791004300117, + "step": 3103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6208, + "grad_norm": 4.805365085601807, + "kl": 1.5159847605973482, + "learning_rate": 3.2066790650379624e-07, + "loss": 0.1516, + "num_tokens": 26826140.0, + "reward": 0.76873779296875, + "reward_std": 0.010273633524775505, + "rewards//mean": 0.76873779296875, + "rewards//std": 0.02669547311961651, + "step": 3104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.621, + "grad_norm": 8.394627571105957, + "kl": 2.0254087038338184, + "learning_rate": 3.2037172324781326e-07, + "loss": 0.2025, + "num_tokens": 26834764.0, + "reward": 0.75811767578125, + "reward_std": 0.005880403332412243, + "rewards//mean": 0.75811767578125, + "rewards//std": 0.016838204115629196, + "step": 3105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6212, + "grad_norm": 6.545122146606445, + "kl": 2.9585273899137974, + "learning_rate": 3.2007561234607877e-07, + "loss": 0.2959, + "num_tokens": 26843428.0, + "reward": 0.72930908203125, + "reward_std": 0.011712558567523956, + "rewards//mean": 0.72930908203125, + "rewards//std": 0.03581949695944786, + "step": 3106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6214, + "grad_norm": 0.7779846787452698, + "kl": 0.9487686809152365, + "learning_rate": 3.1977957391786614e-07, + "loss": 0.0949, + "num_tokens": 26852060.0, + "reward": 0.7794189453125, + "reward_std": 0.004274472594261169, + "rewards//mean": 0.7794189453125, + "rewards//std": 0.027943646535277367, + "step": 3107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6216, + "grad_norm": 2.04065203666687, + "kl": 1.7125043328851461, + "learning_rate": 3.1948360808241944e-07, + "loss": 0.1713, + "num_tokens": 26860660.0, + "reward": 0.73626708984375, + "reward_std": 0.009860499761998653, + "rewards//mean": 0.73626708984375, + "rewards//std": 0.04114013910293579, + "step": 3108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6218, + "grad_norm": 5.217604637145996, + "kl": 2.32062198035419, + "learning_rate": 3.191877149589539e-07, + "loss": 0.2321, + "num_tokens": 26869300.0, + "reward": 0.7391357421875, + "reward_std": 0.010035259649157524, + "rewards//mean": 0.7391357421875, + "rewards//std": 0.02584063820540905, + "step": 3109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.622, + "grad_norm": 1.903734803199768, + "kl": 0.5700356978923082, + "learning_rate": 3.188918946666551e-07, + "loss": 0.057, + "num_tokens": 26878012.0, + "reward": 0.7684326171875, + "reward_std": 0.001883823424577713, + "rewards//mean": 0.7684326171875, + "rewards//std": 0.029367836192250252, + "step": 3110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6222, + "grad_norm": 1.438845157623291, + "kl": 0.8769828472286463, + "learning_rate": 3.1859614732467954e-07, + "loss": 0.0877, + "num_tokens": 26886652.0, + "reward": 0.75433349609375, + "reward_std": 0.00394028052687645, + "rewards//mean": 0.75433349609375, + "rewards//std": 0.023241546005010605, + "step": 3111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6224, + "grad_norm": 3.110837936401367, + "kl": 0.9187993500381708, + "learning_rate": 3.1830047305215415e-07, + "loss": 0.0919, + "num_tokens": 26895292.0, + "reward": 0.79302978515625, + "reward_std": 0.004818592686206102, + "rewards//mean": 0.79302978515625, + "rewards//std": 0.016135813668370247, + "step": 3112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6226, + "grad_norm": 2.0209782123565674, + "kl": 1.9190478641539812, + "learning_rate": 3.1800487196817645e-07, + "loss": 0.1919, + "num_tokens": 26903956.0, + "reward": 0.76531982421875, + "reward_std": 0.012715555727481842, + "rewards//mean": 0.76531982421875, + "rewards//std": 0.027580156922340393, + "step": 3113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6228, + "grad_norm": 1.541587471961975, + "kl": 1.26872250251472, + "learning_rate": 3.177093441918145e-07, + "loss": 0.1269, + "num_tokens": 26912612.0, + "reward": 0.75140380859375, + "reward_std": 0.007800497580319643, + "rewards//mean": 0.75140380859375, + "rewards//std": 0.03101167269051075, + "step": 3114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.623, + "grad_norm": 2.796955108642578, + "kl": 1.9468067102134228, + "learning_rate": 3.1741388984210703e-07, + "loss": 0.1947, + "num_tokens": 26921460.0, + "reward": 0.76666259765625, + "reward_std": 0.01209554448723793, + "rewards//mean": 0.76666259765625, + "rewards//std": 0.03314740210771561, + "step": 3115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6232, + "grad_norm": 14.915283203125, + "kl": 3.437970133498311, + "learning_rate": 3.1711850903806276e-07, + "loss": 0.3438, + "num_tokens": 26930180.0, + "reward": 0.7637939453125, + "reward_std": 0.012564756907522678, + "rewards//mean": 0.7637939453125, + "rewards//std": 0.04729151725769043, + "step": 3116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6234, + "grad_norm": 4.3052592277526855, + "kl": 1.7601696327328682, + "learning_rate": 3.168232018986613e-07, + "loss": 0.176, + "num_tokens": 26938932.0, + "reward": 0.76373291015625, + "reward_std": 0.005914856679737568, + "rewards//mean": 0.76373291015625, + "rewards//std": 0.037814848124980927, + "step": 3117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6236, + "grad_norm": 2.522041082382202, + "kl": 1.7439042888581753, + "learning_rate": 3.165279685428521e-07, + "loss": 0.1744, + "num_tokens": 26947508.0, + "reward": 0.76385498046875, + "reward_std": 0.011641548946499825, + "rewards//mean": 0.76385498046875, + "rewards//std": 0.032858893275260925, + "step": 3118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6238, + "grad_norm": 2.2746622562408447, + "kl": 1.4146077167242765, + "learning_rate": 3.1623280908955536e-07, + "loss": 0.1415, + "num_tokens": 26956220.0, + "reward": 0.76165771484375, + "reward_std": 0.009859396144747734, + "rewards//mean": 0.76165771484375, + "rewards//std": 0.029507897794246674, + "step": 3119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.624, + "grad_norm": 2.389737129211426, + "kl": 0.7305817622691393, + "learning_rate": 3.15937723657661e-07, + "loss": 0.0731, + "num_tokens": 26964796.0, + "reward": 0.748046875, + "reward_std": 0.005656369961798191, + "rewards//mean": 0.748046875, + "rewards//std": 0.02408980019390583, + "step": 3120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6242, + "grad_norm": 1.0637340545654297, + "kl": 0.9786658734083176, + "learning_rate": 3.156427123660297e-07, + "loss": 0.0979, + "num_tokens": 26973396.0, + "reward": 0.74853515625, + "reward_std": 0.00411025108769536, + "rewards//mean": 0.74853515625, + "rewards//std": 0.02980198711156845, + "step": 3121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6244, + "grad_norm": 1.2048319578170776, + "kl": 0.819572277367115, + "learning_rate": 3.1534777533349175e-07, + "loss": 0.082, + "num_tokens": 26982012.0, + "reward": 0.73681640625, + "reward_std": 0.0036867237649858, + "rewards//mean": 0.73681640625, + "rewards//std": 0.03492816165089607, + "step": 3122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6246, + "grad_norm": 2.061579942703247, + "kl": 0.9369387999176979, + "learning_rate": 3.150529126788477e-07, + "loss": 0.0937, + "num_tokens": 26990668.0, + "reward": 0.73028564453125, + "reward_std": 0.0034408888313919306, + "rewards//mean": 0.73028564453125, + "rewards//std": 0.02386179380118847, + "step": 3123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6248, + "grad_norm": 1.8927639722824097, + "kl": 1.4749672934412956, + "learning_rate": 3.147581245208685e-07, + "loss": 0.1475, + "num_tokens": 26999252.0, + "reward": 0.771728515625, + "reward_std": 0.012899941764771938, + "rewards//mean": 0.771728515625, + "rewards//std": 0.0359220989048481, + "step": 3124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.625, + "grad_norm": 5.498333930969238, + "kl": 2.052042596042156, + "learning_rate": 3.144634109782944e-07, + "loss": 0.2052, + "num_tokens": 27007908.0, + "reward": 0.72674560546875, + "reward_std": 0.010197717696428299, + "rewards//mean": 0.72674560546875, + "rewards//std": 0.0300523042678833, + "step": 3125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6252, + "grad_norm": 4.758462905883789, + "kl": 3.0127104371786118, + "learning_rate": 3.141687721698363e-07, + "loss": 0.3013, + "num_tokens": 27016460.0, + "reward": 0.76214599609375, + "reward_std": 0.018529217690229416, + "rewards//mean": 0.76214599609375, + "rewards//std": 0.03231773525476456, + "step": 3126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6254, + "grad_norm": 7.580872535705566, + "kl": 2.2597966212779284, + "learning_rate": 3.138742082141744e-07, + "loss": 0.226, + "num_tokens": 27025108.0, + "reward": 0.739501953125, + "reward_std": 0.012235064059495926, + "rewards//mean": 0.739501953125, + "rewards//std": 0.031603556126356125, + "step": 3127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6256, + "grad_norm": 3.2919862270355225, + "kl": 1.676285794004798, + "learning_rate": 3.1357971922995935e-07, + "loss": 0.1676, + "num_tokens": 27033660.0, + "reward": 0.7835693359375, + "reward_std": 0.012150602415204048, + "rewards//mean": 0.7835693359375, + "rewards//std": 0.03021537885069847, + "step": 3128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6258, + "grad_norm": 1.379388451576233, + "kl": 0.8798391222953796, + "learning_rate": 3.13285305335811e-07, + "loss": 0.088, + "num_tokens": 27042364.0, + "reward": 0.76348876953125, + "reward_std": 0.005756567697972059, + "rewards//mean": 0.76348876953125, + "rewards//std": 0.026392078027129173, + "step": 3129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.626, + "grad_norm": 7.661942958831787, + "kl": 2.413809645920992, + "learning_rate": 3.129909666503194e-07, + "loss": 0.2414, + "num_tokens": 27050964.0, + "reward": 0.76422119140625, + "reward_std": 0.01283988170325756, + "rewards//mean": 0.76422119140625, + "rewards//std": 0.033038534224033356, + "step": 3130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6262, + "grad_norm": 1.2010220289230347, + "kl": 1.024215029552579, + "learning_rate": 3.1269670329204393e-07, + "loss": 0.1024, + "num_tokens": 27059604.0, + "reward": 0.7352294921875, + "reward_std": 0.005371796898543835, + "rewards//mean": 0.7352294921875, + "rewards//std": 0.03005261905491352, + "step": 3131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6264, + "grad_norm": 1.2135000228881836, + "kl": 0.9728917330503464, + "learning_rate": 3.124025153795141e-07, + "loss": 0.0973, + "num_tokens": 27068236.0, + "reward": 0.75, + "reward_std": 0.006383282132446766, + "rewards//mean": 0.75, + "rewards//std": 0.03166576102375984, + "step": 3132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6266, + "grad_norm": 1.7401000261306763, + "kl": 1.906726136803627, + "learning_rate": 3.121084030312286e-07, + "loss": 0.1907, + "num_tokens": 27077036.0, + "reward": 0.75408935546875, + "reward_std": 0.013292869552969933, + "rewards//mean": 0.75408935546875, + "rewards//std": 0.03662027046084404, + "step": 3133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6268, + "grad_norm": 8.339385032653809, + "kl": 1.1240004897117615, + "learning_rate": 3.1181436636565596e-07, + "loss": 0.1124, + "num_tokens": 27085676.0, + "reward": 0.769287109375, + "reward_std": 0.00856110081076622, + "rewards//mean": 0.769287109375, + "rewards//std": 0.03352255001664162, + "step": 3134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.627, + "grad_norm": 0.8577738404273987, + "kl": 0.6520977467298508, + "learning_rate": 3.1152040550123393e-07, + "loss": 0.0652, + "num_tokens": 27094244.0, + "reward": 0.7161865234375, + "reward_std": 0.002557160099968314, + "rewards//mean": 0.7161865234375, + "rewards//std": 0.03721589222550392, + "step": 3135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6272, + "grad_norm": 6.162028789520264, + "kl": 2.551443722099066, + "learning_rate": 3.112265205563701e-07, + "loss": 0.2551, + "num_tokens": 27102876.0, + "reward": 0.74078369140625, + "reward_std": 0.007049069739878178, + "rewards//mean": 0.74078369140625, + "rewards//std": 0.018087342381477356, + "step": 3136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6274, + "grad_norm": 1.6475263833999634, + "kl": 2.1505930833518505, + "learning_rate": 3.109327116494411e-07, + "loss": 0.2151, + "num_tokens": 27111572.0, + "reward": 0.7642822265625, + "reward_std": 0.014820747077465057, + "rewards//mean": 0.7642822265625, + "rewards//std": 0.026831572875380516, + "step": 3137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6276, + "grad_norm": 7.623717308044434, + "kl": 2.237755123525858, + "learning_rate": 3.106389788987934e-07, + "loss": 0.2238, + "num_tokens": 27120268.0, + "reward": 0.75445556640625, + "reward_std": 0.011526870541274548, + "rewards//mean": 0.75445556640625, + "rewards//std": 0.03365949168801308, + "step": 3138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6278, + "grad_norm": 3.597714424133301, + "kl": 1.8729159887880087, + "learning_rate": 3.103453224227424e-07, + "loss": 0.1873, + "num_tokens": 27128868.0, + "reward": 0.7149658203125, + "reward_std": 0.00821800995618105, + "rewards//mean": 0.7149658203125, + "rewards//std": 0.03675585985183716, + "step": 3139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.628, + "grad_norm": 5.554637432098389, + "kl": 2.5638196859508753, + "learning_rate": 3.1005174233957267e-07, + "loss": 0.2564, + "num_tokens": 27137620.0, + "reward": 0.7008056640625, + "reward_std": 0.007781898602843285, + "rewards//mean": 0.7008056640625, + "rewards//std": 0.04020959138870239, + "step": 3140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6282, + "grad_norm": 3.158529758453369, + "kl": 1.7671424578875303, + "learning_rate": 3.097582387675385e-07, + "loss": 0.1767, + "num_tokens": 27146228.0, + "reward": 0.76055908203125, + "reward_std": 0.012225020676851273, + "rewards//mean": 0.76055908203125, + "rewards//std": 0.019409088417887688, + "step": 3141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6284, + "grad_norm": 1.5752569437026978, + "kl": 0.8533081989735365, + "learning_rate": 3.0946481182486297e-07, + "loss": 0.0853, + "num_tokens": 27154900.0, + "reward": 0.7677001953125, + "reward_std": 0.0038587411399930716, + "rewards//mean": 0.7677001953125, + "rewards//std": 0.026282083243131638, + "step": 3142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6286, + "grad_norm": 6.716192722320557, + "kl": 1.98455461114645, + "learning_rate": 3.0917146162973846e-07, + "loss": 0.1985, + "num_tokens": 27163476.0, + "reward": 0.724365234375, + "reward_std": 0.010570337995886803, + "rewards//mean": 0.724365234375, + "rewards//std": 0.03642427176237106, + "step": 3143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6288, + "grad_norm": 6.3711652755737305, + "kl": 2.782665519043803, + "learning_rate": 3.088781883003263e-07, + "loss": 0.2783, + "num_tokens": 27172196.0, + "reward": 0.76275634765625, + "reward_std": 0.013066920451819897, + "rewards//mean": 0.76275634765625, + "rewards//std": 0.04311360791325569, + "step": 3144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.629, + "grad_norm": 9.437460899353027, + "kl": 2.107087839394808, + "learning_rate": 3.085849919547572e-07, + "loss": 0.2107, + "num_tokens": 27180900.0, + "reward": 0.78607177734375, + "reward_std": 0.009613605216145515, + "rewards//mean": 0.78607177734375, + "rewards//std": 0.03897399827837944, + "step": 3145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6292, + "grad_norm": 3.126075506210327, + "kl": 0.7504993639886379, + "learning_rate": 3.0829187271113035e-07, + "loss": 0.075, + "num_tokens": 27189596.0, + "reward": 0.76080322265625, + "reward_std": 0.005066409707069397, + "rewards//mean": 0.76080322265625, + "rewards//std": 0.024505889043211937, + "step": 3146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6294, + "grad_norm": 2.2148725986480713, + "kl": 1.9243103861808777, + "learning_rate": 3.079988306875143e-07, + "loss": 0.1924, + "num_tokens": 27198212.0, + "reward": 0.790283203125, + "reward_std": 0.008957324549555779, + "rewards//mean": 0.790283203125, + "rewards//std": 0.02913106232881546, + "step": 3147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6296, + "grad_norm": 2.6910064220428467, + "kl": 2.2351403143256903, + "learning_rate": 3.0770586600194614e-07, + "loss": 0.2235, + "num_tokens": 27206828.0, + "reward": 0.73382568359375, + "reward_std": 0.011133957654237747, + "rewards//mean": 0.73382568359375, + "rewards//std": 0.034003131091594696, + "step": 3148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6298, + "grad_norm": 1.614180088043213, + "kl": 1.5879469960927963, + "learning_rate": 3.0741297877243235e-07, + "loss": 0.1588, + "num_tokens": 27215388.0, + "reward": 0.777099609375, + "reward_std": 0.012235360220074654, + "rewards//mean": 0.777099609375, + "rewards//std": 0.030432282015681267, + "step": 3149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.63, + "grad_norm": 2.065922975540161, + "kl": 1.2229218669235706, + "learning_rate": 3.0712016911694755e-07, + "loss": 0.1223, + "num_tokens": 27224004.0, + "reward": 0.79766845703125, + "reward_std": 0.005130096338689327, + "rewards//mean": 0.79766845703125, + "rewards//std": 0.020113827660679817, + "step": 3150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6302, + "grad_norm": 1.8785799741744995, + "kl": 0.8274429365992546, + "learning_rate": 3.068274371534356e-07, + "loss": 0.0827, + "num_tokens": 27232636.0, + "reward": 0.7532958984375, + "reward_std": 0.0033139195293188095, + "rewards//mean": 0.7532958984375, + "rewards//std": 0.01985982060432434, + "step": 3151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6304, + "grad_norm": 8.772117614746094, + "kl": 2.204254474490881, + "learning_rate": 3.065347829998089e-07, + "loss": 0.2204, + "num_tokens": 27241308.0, + "reward": 0.7584228515625, + "reward_std": 0.008398022502660751, + "rewards//mean": 0.7584228515625, + "rewards//std": 0.030442973598837852, + "step": 3152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6306, + "grad_norm": 4.7381720542907715, + "kl": 1.3675116952508688, + "learning_rate": 3.0624220677394854e-07, + "loss": 0.1368, + "num_tokens": 27249924.0, + "reward": 0.7928466796875, + "reward_std": 0.006301639601588249, + "rewards//mean": 0.7928466796875, + "rewards//std": 0.02940492518246174, + "step": 3153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6308, + "grad_norm": 2.4788644313812256, + "kl": 1.8945719357579947, + "learning_rate": 3.0594970859370404e-07, + "loss": 0.1895, + "num_tokens": 27258492.0, + "reward": 0.775146484375, + "reward_std": 0.011018604971468449, + "rewards//mean": 0.775146484375, + "rewards//std": 0.026308851316571236, + "step": 3154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.631, + "grad_norm": 2.99969482421875, + "kl": 0.9052226021885872, + "learning_rate": 3.0565728857689366e-07, + "loss": 0.0905, + "num_tokens": 27267172.0, + "reward": 0.748046875, + "reward_std": 0.00533553259447217, + "rewards//mean": 0.748046875, + "rewards//std": 0.042971570044755936, + "step": 3155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6312, + "grad_norm": 2.036201238632202, + "kl": 0.7937064673751593, + "learning_rate": 3.053649468413043e-07, + "loss": 0.0794, + "num_tokens": 27275772.0, + "reward": 0.764404296875, + "reward_std": 0.0024798414669930935, + "rewards//mean": 0.764404296875, + "rewards//std": 0.025285478681325912, + "step": 3156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6314, + "grad_norm": 3.777538299560547, + "kl": 1.2024687584489584, + "learning_rate": 3.0507268350469093e-07, + "loss": 0.1202, + "num_tokens": 27284356.0, + "reward": 0.72576904296875, + "reward_std": 0.0044524529948830605, + "rewards//mean": 0.72576904296875, + "rewards//std": 0.03463345766067505, + "step": 3157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6316, + "grad_norm": 1.8396525382995605, + "kl": 1.1621283646672964, + "learning_rate": 3.0478049868477745e-07, + "loss": 0.1162, + "num_tokens": 27292972.0, + "reward": 0.7869873046875, + "reward_std": 0.007918967865407467, + "rewards//mean": 0.7869873046875, + "rewards//std": 0.02783944085240364, + "step": 3158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6318, + "grad_norm": 1.0362017154693604, + "kl": 1.504127511754632, + "learning_rate": 3.0448839249925566e-07, + "loss": 0.1504, + "num_tokens": 27301564.0, + "reward": 0.7421875, + "reward_std": 0.008791664615273476, + "rewards//mean": 0.7421875, + "rewards//std": 0.040078550577163696, + "step": 3159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.632, + "grad_norm": 3.70186185836792, + "kl": 1.7285435479134321, + "learning_rate": 3.0419636506578617e-07, + "loss": 0.1729, + "num_tokens": 27310204.0, + "reward": 0.7513427734375, + "reward_std": 0.005594527814537287, + "rewards//mean": 0.7513427734375, + "rewards//std": 0.030010277405381203, + "step": 3160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6322, + "grad_norm": 8.400199890136719, + "kl": 1.6542526688426733, + "learning_rate": 3.039044165019972e-07, + "loss": 0.1654, + "num_tokens": 27318796.0, + "reward": 0.7462158203125, + "reward_std": 0.006825732067227364, + "rewards//mean": 0.7462158203125, + "rewards//std": 0.03151986002922058, + "step": 3161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6324, + "grad_norm": 2.9115359783172607, + "kl": 1.54690745100379, + "learning_rate": 3.03612546925486e-07, + "loss": 0.1547, + "num_tokens": 27327364.0, + "reward": 0.75164794921875, + "reward_std": 0.008320080116391182, + "rewards//mean": 0.75164794921875, + "rewards//std": 0.0391480028629303, + "step": 3162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6326, + "grad_norm": 1.4277085065841675, + "kl": 1.6228833571076393, + "learning_rate": 3.0332075645381726e-07, + "loss": 0.1623, + "num_tokens": 27336020.0, + "reward": 0.7691650390625, + "reward_std": 0.01331576332449913, + "rewards//mean": 0.7691650390625, + "rewards//std": 0.030920563265681267, + "step": 3163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6328, + "grad_norm": 24.6442928314209, + "kl": 0.6501802746206522, + "learning_rate": 3.0302904520452443e-07, + "loss": 0.065, + "num_tokens": 27344644.0, + "reward": 0.76312255859375, + "reward_std": 0.0018989683594554663, + "rewards//mean": 0.76312255859375, + "rewards//std": 0.02014015056192875, + "step": 3164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.633, + "grad_norm": 2.445739984512329, + "kl": 1.0878155399113894, + "learning_rate": 3.027374132951085e-07, + "loss": 0.1088, + "num_tokens": 27353380.0, + "reward": 0.7657470703125, + "reward_std": 0.008215473964810371, + "rewards//mean": 0.7657470703125, + "rewards//std": 0.028805769979953766, + "step": 3165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6332, + "grad_norm": 6.0225019454956055, + "kl": 1.1758133918046951, + "learning_rate": 3.02445860843039e-07, + "loss": 0.1176, + "num_tokens": 27361980.0, + "reward": 0.709716796875, + "reward_std": 0.004372308496385813, + "rewards//mean": 0.709716796875, + "rewards//std": 0.03481268882751465, + "step": 3166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6334, + "grad_norm": 3.760218381881714, + "kl": 1.5501990523189306, + "learning_rate": 3.0215438796575327e-07, + "loss": 0.155, + "num_tokens": 27370644.0, + "reward": 0.75994873046875, + "reward_std": 0.011293101124465466, + "rewards//mean": 0.75994873046875, + "rewards//std": 0.0349033959209919, + "step": 3167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6336, + "grad_norm": 0.9934365749359131, + "kl": 0.646508727222681, + "learning_rate": 3.018629947806563e-07, + "loss": 0.0647, + "num_tokens": 27379356.0, + "reward": 0.72698974609375, + "reward_std": 0.0034567639231681824, + "rewards//mean": 0.72698974609375, + "rewards//std": 0.02088114432990551, + "step": 3168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6338, + "grad_norm": 3.340820074081421, + "kl": 1.812185550108552, + "learning_rate": 3.015716814051212e-07, + "loss": 0.1812, + "num_tokens": 27388020.0, + "reward": 0.7200927734375, + "reward_std": 0.012522836215794086, + "rewards//mean": 0.7200927734375, + "rewards//std": 0.027305802330374718, + "step": 3169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.634, + "grad_norm": 4.530235767364502, + "kl": 1.0209522172808647, + "learning_rate": 3.0128044795648923e-07, + "loss": 0.1021, + "num_tokens": 27396596.0, + "reward": 0.72540283203125, + "reward_std": 0.004767494276165962, + "rewards//mean": 0.72540283203125, + "rewards//std": 0.02541736513376236, + "step": 3170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6342, + "grad_norm": 0.9173640608787537, + "kl": 1.0522732343524694, + "learning_rate": 3.00989294552069e-07, + "loss": 0.1052, + "num_tokens": 27405252.0, + "reward": 0.7158203125, + "reward_std": 0.006510760169476271, + "rewards//mean": 0.7158203125, + "rewards//std": 0.029968129470944405, + "step": 3171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6344, + "grad_norm": 5.577796936035156, + "kl": 1.621506566181779, + "learning_rate": 3.0069822130913716e-07, + "loss": 0.1622, + "num_tokens": 27413908.0, + "reward": 0.7481689453125, + "reward_std": 0.00740332854911685, + "rewards//mean": 0.7481689453125, + "rewards//std": 0.02704511024057865, + "step": 3172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6346, + "grad_norm": 1.3179950714111328, + "kl": 1.439491981640458, + "learning_rate": 3.004072283449379e-07, + "loss": 0.1439, + "num_tokens": 27422580.0, + "reward": 0.73431396484375, + "reward_std": 0.006755572743713856, + "rewards//mean": 0.73431396484375, + "rewards//std": 0.025915732607245445, + "step": 3173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6348, + "grad_norm": 1.3385539054870605, + "kl": 1.2795591689646244, + "learning_rate": 3.0011631577668325e-07, + "loss": 0.128, + "num_tokens": 27431220.0, + "reward": 0.7489013671875, + "reward_std": 0.008281301707029343, + "rewards//mean": 0.7489013671875, + "rewards//std": 0.03137738257646561, + "step": 3174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.635, + "grad_norm": 2.0310118198394775, + "kl": 1.1529672890901566, + "learning_rate": 2.9982548372155256e-07, + "loss": 0.1153, + "num_tokens": 27439796.0, + "reward": 0.7442626953125, + "reward_std": 0.006387750152498484, + "rewards//mean": 0.7442626953125, + "rewards//std": 0.03805094584822655, + "step": 3175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6352, + "grad_norm": 2.385164976119995, + "kl": 0.9767811447381973, + "learning_rate": 2.9953473229669324e-07, + "loss": 0.0977, + "num_tokens": 27448500.0, + "reward": 0.759033203125, + "reward_std": 0.0031812458764761686, + "rewards//mean": 0.759033203125, + "rewards//std": 0.02960118092596531, + "step": 3176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6354, + "grad_norm": 4.116273403167725, + "kl": 0.9504358042031527, + "learning_rate": 2.9924406161921966e-07, + "loss": 0.095, + "num_tokens": 27457132.0, + "reward": 0.77362060546875, + "reward_std": 0.007713501341640949, + "rewards//mean": 0.77362060546875, + "rewards//std": 0.027930304408073425, + "step": 3177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6356, + "grad_norm": 2.7953412532806396, + "kl": 1.0363605562597513, + "learning_rate": 2.989534718062142e-07, + "loss": 0.1036, + "num_tokens": 27465716.0, + "reward": 0.74749755859375, + "reward_std": 0.007248516194522381, + "rewards//mean": 0.74749755859375, + "rewards//std": 0.02036736160516739, + "step": 3178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6358, + "grad_norm": 0.8113217949867249, + "kl": 0.6432873737066984, + "learning_rate": 2.9866296297472613e-07, + "loss": 0.0643, + "num_tokens": 27474396.0, + "reward": 0.76629638671875, + "reward_std": 0.002399325603619218, + "rewards//mean": 0.76629638671875, + "rewards//std": 0.026213670149445534, + "step": 3179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.636, + "grad_norm": 1.290604829788208, + "kl": 0.9276379011571407, + "learning_rate": 2.9837253524177256e-07, + "loss": 0.0928, + "num_tokens": 27483092.0, + "reward": 0.70654296875, + "reward_std": 0.0047340402379632, + "rewards//mean": 0.70654296875, + "rewards//std": 0.01913031004369259, + "step": 3180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6362, + "grad_norm": 2.0475006103515625, + "kl": 1.6169743947684765, + "learning_rate": 2.9808218872433766e-07, + "loss": 0.1617, + "num_tokens": 27491740.0, + "reward": 0.7880859375, + "reward_std": 0.013932289555668831, + "rewards//mean": 0.7880859375, + "rewards//std": 0.030161473900079727, + "step": 3181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6364, + "grad_norm": 4.084904670715332, + "kl": 1.594319574534893, + "learning_rate": 2.97791923539373e-07, + "loss": 0.1594, + "num_tokens": 27500348.0, + "reward": 0.7547607421875, + "reward_std": 0.0063932365737855434, + "rewards//mean": 0.7547607421875, + "rewards//std": 0.020931558683514595, + "step": 3182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6366, + "grad_norm": 1.3759980201721191, + "kl": 0.8413648251444101, + "learning_rate": 2.9750173980379733e-07, + "loss": 0.0841, + "num_tokens": 27508980.0, + "reward": 0.73907470703125, + "reward_std": 0.003617936512455344, + "rewards//mean": 0.73907470703125, + "rewards//std": 0.03160589188337326, + "step": 3183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6368, + "grad_norm": 5.2810773849487305, + "kl": 1.6838479936122894, + "learning_rate": 2.9721163763449677e-07, + "loss": 0.1684, + "num_tokens": 27517612.0, + "reward": 0.7344970703125, + "reward_std": 0.004730177111923695, + "rewards//mean": 0.7344970703125, + "rewards//std": 0.02001471072435379, + "step": 3184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.637, + "grad_norm": 4.291467666625977, + "kl": 2.8122759610414505, + "learning_rate": 2.969216171483242e-07, + "loss": 0.2812, + "num_tokens": 27526204.0, + "reward": 0.74444580078125, + "reward_std": 0.011747878044843674, + "rewards//mean": 0.74444580078125, + "rewards//std": 0.03134908527135849, + "step": 3185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6372, + "grad_norm": 2.1183671951293945, + "kl": 1.512441173195839, + "learning_rate": 2.9663167846209996e-07, + "loss": 0.1512, + "num_tokens": 27534764.0, + "reward": 0.74627685546875, + "reward_std": 0.006587772164493799, + "rewards//mean": 0.74627685546875, + "rewards//std": 0.027947641909122467, + "step": 3186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6374, + "grad_norm": 2.2507266998291016, + "kl": 1.7769907414913177, + "learning_rate": 2.9634182169261133e-07, + "loss": 0.1777, + "num_tokens": 27543372.0, + "reward": 0.77142333984375, + "reward_std": 0.018290365114808083, + "rewards//mean": 0.77142333984375, + "rewards//std": 0.0319095216691494, + "step": 3187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6376, + "grad_norm": 7.5387420654296875, + "kl": 1.768872519955039, + "learning_rate": 2.9605204695661256e-07, + "loss": 0.1769, + "num_tokens": 27552020.0, + "reward": 0.78692626953125, + "reward_std": 0.009006926789879799, + "rewards//mean": 0.78692626953125, + "rewards//std": 0.02768205665051937, + "step": 3188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6378, + "grad_norm": 3.357470989227295, + "kl": 1.4807263165712357, + "learning_rate": 2.9576235437082495e-07, + "loss": 0.1481, + "num_tokens": 27560604.0, + "reward": 0.74835205078125, + "reward_std": 0.004167427308857441, + "rewards//mean": 0.74835205078125, + "rewards//std": 0.027602102607488632, + "step": 3189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.638, + "grad_norm": 1.544311285018921, + "kl": 1.396449888125062, + "learning_rate": 2.9547274405193645e-07, + "loss": 0.1396, + "num_tokens": 27569244.0, + "reward": 0.748779296875, + "reward_std": 0.010326618328690529, + "rewards//mean": 0.748779296875, + "rewards//std": 0.030216630548238754, + "step": 3190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6382, + "grad_norm": 3.0462586879730225, + "kl": 0.5273773763328791, + "learning_rate": 2.9518321611660234e-07, + "loss": 0.0527, + "num_tokens": 27577876.0, + "reward": 0.775146484375, + "reward_std": 0.0020117778331041336, + "rewards//mean": 0.775146484375, + "rewards//std": 0.024278830736875534, + "step": 3191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6384, + "grad_norm": 5.227640628814697, + "kl": 2.4195780493319035, + "learning_rate": 2.948937706814442e-07, + "loss": 0.242, + "num_tokens": 27586612.0, + "reward": 0.75384521484375, + "reward_std": 0.007949243299663067, + "rewards//mean": 0.75384521484375, + "rewards//std": 0.03499046340584755, + "step": 3192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6386, + "grad_norm": 3.1931231021881104, + "kl": 2.8693448677659035, + "learning_rate": 2.9460440786305077e-07, + "loss": 0.2869, + "num_tokens": 27595332.0, + "reward": 0.75189208984375, + "reward_std": 0.01898665726184845, + "rewards//mean": 0.75189208984375, + "rewards//std": 0.03954925388097763, + "step": 3193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6388, + "grad_norm": 1.316381573677063, + "kl": 0.9640701655298471, + "learning_rate": 2.943151277779771e-07, + "loss": 0.0964, + "num_tokens": 27603852.0, + "reward": 0.74993896484375, + "reward_std": 0.006645615212619305, + "rewards//mean": 0.74993896484375, + "rewards//std": 0.030469007790088654, + "step": 3194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.639, + "grad_norm": 9.700461387634277, + "kl": 2.4781790282577276, + "learning_rate": 2.9402593054274557e-07, + "loss": 0.2478, + "num_tokens": 27612612.0, + "reward": 0.76165771484375, + "reward_std": 0.009859190322458744, + "rewards//mean": 0.76165771484375, + "rewards//std": 0.03443928435444832, + "step": 3195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6392, + "grad_norm": 1.7450716495513916, + "kl": 1.5546567905694246, + "learning_rate": 2.9373681627384445e-07, + "loss": 0.1555, + "num_tokens": 27621188.0, + "reward": 0.72003173828125, + "reward_std": 0.0082898810505867, + "rewards//mean": 0.72003173828125, + "rewards//std": 0.03446432948112488, + "step": 3196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6394, + "grad_norm": 3.1519744396209717, + "kl": 1.8166353348642588, + "learning_rate": 2.9344778508772914e-07, + "loss": 0.1817, + "num_tokens": 27629764.0, + "reward": 0.771240234375, + "reward_std": 0.007098815869539976, + "rewards//mean": 0.771240234375, + "rewards//std": 0.01786513440310955, + "step": 3197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6396, + "grad_norm": 1.9578877687454224, + "kl": 2.320758419111371, + "learning_rate": 2.9315883710082125e-07, + "loss": 0.2321, + "num_tokens": 27638532.0, + "reward": 0.74432373046875, + "reward_std": 0.01311455387622118, + "rewards//mean": 0.74432373046875, + "rewards//std": 0.03347956761717796, + "step": 3198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6398, + "grad_norm": 0.851631760597229, + "kl": 0.6492875460535288, + "learning_rate": 2.9286997242950913e-07, + "loss": 0.0649, + "num_tokens": 27647124.0, + "reward": 0.79296875, + "reward_std": 0.0029260218143463135, + "rewards//mean": 0.79296875, + "rewards//std": 0.02970837987959385, + "step": 3199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.64, + "grad_norm": 11.056718826293945, + "kl": 3.333649903535843, + "learning_rate": 2.925811911901473e-07, + "loss": 0.3334, + "num_tokens": 27655900.0, + "reward": 0.75616455078125, + "reward_std": 0.01273740828037262, + "rewards//mean": 0.75616455078125, + "rewards//std": 0.022507645189762115, + "step": 3200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6402, + "grad_norm": 3.682708740234375, + "kl": 1.9705723728984594, + "learning_rate": 2.922924934990568e-07, + "loss": 0.1971, + "num_tokens": 27664524.0, + "reward": 0.7333984375, + "reward_std": 0.009780299849808216, + "rewards//mean": 0.7333984375, + "rewards//std": 0.04262068122625351, + "step": 3201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6404, + "grad_norm": 2.2887749671936035, + "kl": 2.2216028925031424, + "learning_rate": 2.920038794725252e-07, + "loss": 0.2222, + "num_tokens": 27673132.0, + "reward": 0.7496337890625, + "reward_std": 0.01457863487303257, + "rewards//mean": 0.7496337890625, + "rewards//std": 0.032827842980623245, + "step": 3202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6406, + "grad_norm": 2.242955207824707, + "kl": 1.4167871549725533, + "learning_rate": 2.9171534922680597e-07, + "loss": 0.1417, + "num_tokens": 27681772.0, + "reward": 0.7420654296875, + "reward_std": 0.007878679782152176, + "rewards//mean": 0.7420654296875, + "rewards//std": 0.031187692657113075, + "step": 3203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6408, + "grad_norm": 2.297172784805298, + "kl": 1.25296201556921, + "learning_rate": 2.914269028781191e-07, + "loss": 0.1253, + "num_tokens": 27690356.0, + "reward": 0.7486572265625, + "reward_std": 0.01139303669333458, + "rewards//mean": 0.7486572265625, + "rewards//std": 0.0341406911611557, + "step": 3204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.641, + "grad_norm": 6.0641889572143555, + "kl": 1.6360020842403173, + "learning_rate": 2.9113854054265107e-07, + "loss": 0.1636, + "num_tokens": 27699028.0, + "reward": 0.747802734375, + "reward_std": 0.008586332201957703, + "rewards//mean": 0.747802734375, + "rewards//std": 0.033939018845558167, + "step": 3205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6412, + "grad_norm": 4.708662509918213, + "kl": 1.6799761708825827, + "learning_rate": 2.9085026233655365e-07, + "loss": 0.168, + "num_tokens": 27707668.0, + "reward": 0.768798828125, + "reward_std": 0.009926151484251022, + "rewards//mean": 0.768798828125, + "rewards//std": 0.04073714092373848, + "step": 3206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6414, + "grad_norm": 8.093809127807617, + "kl": 2.8707738015800714, + "learning_rate": 2.9056206837594563e-07, + "loss": 0.2871, + "num_tokens": 27716428.0, + "reward": 0.74468994140625, + "reward_std": 0.011815742589533329, + "rewards//mean": 0.74468994140625, + "rewards//std": 0.035113975405693054, + "step": 3207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6416, + "grad_norm": 8.044500350952148, + "kl": 2.528752513229847, + "learning_rate": 2.902739587769114e-07, + "loss": 0.2529, + "num_tokens": 27725004.0, + "reward": 0.788818359375, + "reward_std": 0.013415473513305187, + "rewards//mean": 0.788818359375, + "rewards//std": 0.03732413798570633, + "step": 3208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6418, + "grad_norm": 1.558612585067749, + "kl": 1.7412698231637478, + "learning_rate": 2.8998593365550173e-07, + "loss": 0.1741, + "num_tokens": 27733788.0, + "reward": 0.73260498046875, + "reward_std": 0.01013216469436884, + "rewards//mean": 0.73260498046875, + "rewards//std": 0.04476013034582138, + "step": 3209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.642, + "grad_norm": 5.734182834625244, + "kl": 2.5709457620978355, + "learning_rate": 2.896979931277326e-07, + "loss": 0.2571, + "num_tokens": 27742476.0, + "reward": 0.7342529296875, + "reward_std": 0.012070484459400177, + "rewards//mean": 0.7342529296875, + "rewards//std": 0.03645148128271103, + "step": 3210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6422, + "grad_norm": 2.6067819595336914, + "kl": 1.6942967772483826, + "learning_rate": 2.894101373095867e-07, + "loss": 0.1694, + "num_tokens": 27751092.0, + "reward": 0.74505615234375, + "reward_std": 0.007338922005146742, + "rewards//mean": 0.74505615234375, + "rewards//std": 0.03020353987812996, + "step": 3211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6424, + "grad_norm": 4.850281238555908, + "kl": 0.7114821635186672, + "learning_rate": 2.891223663170123e-07, + "loss": 0.0711, + "num_tokens": 27759852.0, + "reward": 0.755859375, + "reward_std": 0.003267057007178664, + "rewards//mean": 0.755859375, + "rewards//std": 0.030313663184642792, + "step": 3212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6426, + "grad_norm": 8.045666694641113, + "kl": 1.6360849943012, + "learning_rate": 2.888346802659238e-07, + "loss": 0.1636, + "num_tokens": 27768508.0, + "reward": 0.759521484375, + "reward_std": 0.0027669956907629967, + "rewards//mean": 0.759521484375, + "rewards//std": 0.033774483948946, + "step": 3213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6428, + "grad_norm": 4.150081157684326, + "kl": 1.0186133608222008, + "learning_rate": 2.8854707927220057e-07, + "loss": 0.1019, + "num_tokens": 27777124.0, + "reward": 0.76104736328125, + "reward_std": 0.001957073574885726, + "rewards//mean": 0.76104736328125, + "rewards//std": 0.030405845493078232, + "step": 3214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.643, + "grad_norm": 3.089136838912964, + "kl": 2.214523918926716, + "learning_rate": 2.8825956345168854e-07, + "loss": 0.2215, + "num_tokens": 27785796.0, + "reward": 0.779541015625, + "reward_std": 0.011461608111858368, + "rewards//mean": 0.779541015625, + "rewards//std": 0.024664821103215218, + "step": 3215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6432, + "grad_norm": 1.5396802425384521, + "kl": 1.268760645762086, + "learning_rate": 2.8797213292019924e-07, + "loss": 0.1269, + "num_tokens": 27794500.0, + "reward": 0.72930908203125, + "reward_std": 0.004782720468938351, + "rewards//mean": 0.72930908203125, + "rewards//std": 0.02496548369526863, + "step": 3216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6434, + "grad_norm": 2.96240496635437, + "kl": 1.096575304865837, + "learning_rate": 2.8768478779350925e-07, + "loss": 0.1097, + "num_tokens": 27803164.0, + "reward": 0.77490234375, + "reward_std": 0.00632492545992136, + "rewards//mean": 0.77490234375, + "rewards//std": 0.019193509593605995, + "step": 3217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6436, + "grad_norm": 2.3168208599090576, + "kl": 1.455308098345995, + "learning_rate": 2.873975281873613e-07, + "loss": 0.1455, + "num_tokens": 27811780.0, + "reward": 0.7608642578125, + "reward_std": 0.010961771942675114, + "rewards//mean": 0.7608642578125, + "rewards//std": 0.03635500743985176, + "step": 3218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6438, + "grad_norm": 5.788711071014404, + "kl": 2.0260234605520964, + "learning_rate": 2.8711035421746363e-07, + "loss": 0.2026, + "num_tokens": 27820460.0, + "reward": 0.7518310546875, + "reward_std": 0.01078803837299347, + "rewards//mean": 0.7518310546875, + "rewards//std": 0.0340038537979126, + "step": 3219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.644, + "grad_norm": 1.0695339441299438, + "kl": 1.2334889862686396, + "learning_rate": 2.8682326599949e-07, + "loss": 0.1233, + "num_tokens": 27829092.0, + "reward": 0.7410888671875, + "reward_std": 0.005528006702661514, + "rewards//mean": 0.7410888671875, + "rewards//std": 0.03249972686171532, + "step": 3220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6442, + "grad_norm": 3.0594613552093506, + "kl": 0.9796166308224201, + "learning_rate": 2.8653626364907914e-07, + "loss": 0.098, + "num_tokens": 27837652.0, + "reward": 0.755126953125, + "reward_std": 0.007470668293535709, + "rewards//mean": 0.755126953125, + "rewards//std": 0.026665473356842995, + "step": 3221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6444, + "grad_norm": 4.423707485198975, + "kl": 1.2276590261608362, + "learning_rate": 2.862493472818357e-07, + "loss": 0.1228, + "num_tokens": 27846292.0, + "reward": 0.72747802734375, + "reward_std": 0.0041482714004814625, + "rewards//mean": 0.72747802734375, + "rewards//std": 0.02673683688044548, + "step": 3222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6446, + "grad_norm": 3.517082929611206, + "kl": 0.957864573225379, + "learning_rate": 2.859625170133297e-07, + "loss": 0.0958, + "num_tokens": 27854892.0, + "reward": 0.7637939453125, + "reward_std": 0.00795029103755951, + "rewards//mean": 0.7637939453125, + "rewards//std": 0.01902199164032936, + "step": 3223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6448, + "grad_norm": 2.4361133575439453, + "kl": 1.8025647979229689, + "learning_rate": 2.856757729590964e-07, + "loss": 0.1803, + "num_tokens": 27863452.0, + "reward": 0.74578857421875, + "reward_std": 0.016121895983815193, + "rewards//mean": 0.74578857421875, + "rewards//std": 0.029382973909378052, + "step": 3224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.645, + "grad_norm": 1.9291067123413086, + "kl": 0.829675642773509, + "learning_rate": 2.853891152346359e-07, + "loss": 0.083, + "num_tokens": 27872092.0, + "reward": 0.75445556640625, + "reward_std": 0.005589920096099377, + "rewards//mean": 0.75445556640625, + "rewards//std": 0.02086663991212845, + "step": 3225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6452, + "grad_norm": 2.8478314876556396, + "kl": 1.350809931755066, + "learning_rate": 2.8510254395541414e-07, + "loss": 0.1351, + "num_tokens": 27880732.0, + "reward": 0.74078369140625, + "reward_std": 0.0052598146721720695, + "rewards//mean": 0.74078369140625, + "rewards//std": 0.027271617203950882, + "step": 3226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6454, + "grad_norm": 2.021186351776123, + "kl": 1.0481289066374302, + "learning_rate": 2.8481605923686205e-07, + "loss": 0.1048, + "num_tokens": 27889300.0, + "reward": 0.80072021484375, + "reward_std": 0.011301252990961075, + "rewards//mean": 0.80072021484375, + "rewards//std": 0.023852277547121048, + "step": 3227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6456, + "grad_norm": 2.22578501701355, + "kl": 1.1010988503694534, + "learning_rate": 2.845296611943756e-07, + "loss": 0.1101, + "num_tokens": 27897964.0, + "reward": 0.7491455078125, + "reward_std": 0.00615822896361351, + "rewards//mean": 0.7491455078125, + "rewards//std": 0.02858208492398262, + "step": 3228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6458, + "grad_norm": 8.999972343444824, + "kl": 1.2499917037785053, + "learning_rate": 2.842433499433158e-07, + "loss": 0.125, + "num_tokens": 27906644.0, + "reward": 0.75872802734375, + "reward_std": 0.006714365445077419, + "rewards//mean": 0.75872802734375, + "rewards//std": 0.03297524154186249, + "step": 3229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.646, + "grad_norm": 2.8594837188720703, + "kl": 1.0000901874154806, + "learning_rate": 2.8395712559900874e-07, + "loss": 0.1, + "num_tokens": 27915292.0, + "reward": 0.7659912109375, + "reward_std": 0.00798336137086153, + "rewards//mean": 0.7659912109375, + "rewards//std": 0.02968161553144455, + "step": 3230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6462, + "grad_norm": 2.1313560009002686, + "kl": 1.5216837543994188, + "learning_rate": 2.8367098827674576e-07, + "loss": 0.1522, + "num_tokens": 27923988.0, + "reward": 0.7232666015625, + "reward_std": 0.007899959571659565, + "rewards//mean": 0.7232666015625, + "rewards//std": 0.035202506929636, + "step": 3231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6464, + "grad_norm": 2.0264391899108887, + "kl": 2.807942384853959, + "learning_rate": 2.83384938091783e-07, + "loss": 0.2808, + "num_tokens": 27932644.0, + "reward": 0.78759765625, + "reward_std": 0.020425546914339066, + "rewards//mean": 0.78759765625, + "rewards//std": 0.03553314134478569, + "step": 3232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6466, + "grad_norm": 1.6083614826202393, + "kl": 0.7944556940346956, + "learning_rate": 2.83098975159341e-07, + "loss": 0.0794, + "num_tokens": 27941204.0, + "reward": 0.74560546875, + "reward_std": 0.0021288893185555935, + "rewards//mean": 0.74560546875, + "rewards//std": 0.03081684559583664, + "step": 3233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6468, + "grad_norm": 3.406136989593506, + "kl": 1.338389240205288, + "learning_rate": 2.8281309959460595e-07, + "loss": 0.1338, + "num_tokens": 27949836.0, + "reward": 0.778076171875, + "reward_std": 0.007263952866196632, + "rewards//mean": 0.778076171875, + "rewards//std": 0.0242288988083601, + "step": 3234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.647, + "grad_norm": 9.926509857177734, + "kl": 1.5007369592785835, + "learning_rate": 2.825273115127286e-07, + "loss": 0.1501, + "num_tokens": 27958460.0, + "reward": 0.71697998046875, + "reward_std": 0.0026110606268048286, + "rewards//mean": 0.71697998046875, + "rewards//std": 0.027734503149986267, + "step": 3235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6472, + "grad_norm": 5.149123191833496, + "kl": 2.7244425676763058, + "learning_rate": 2.8224161102882393e-07, + "loss": 0.2724, + "num_tokens": 27967132.0, + "reward": 0.76092529296875, + "reward_std": 0.018170610070228577, + "rewards//mean": 0.76092529296875, + "rewards//std": 0.03647281229496002, + "step": 3236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6474, + "grad_norm": 1.6717604398727417, + "kl": 1.6383025869727135, + "learning_rate": 2.819559982579723e-07, + "loss": 0.1638, + "num_tokens": 27975756.0, + "reward": 0.71832275390625, + "reward_std": 0.010321813635528088, + "rewards//mean": 0.71832275390625, + "rewards//std": 0.03947032615542412, + "step": 3237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6476, + "grad_norm": 3.1110386848449707, + "kl": 2.3777016047388315, + "learning_rate": 2.8167047331521847e-07, + "loss": 0.2378, + "num_tokens": 27984460.0, + "reward": 0.77899169921875, + "reward_std": 0.015100955963134766, + "rewards//mean": 0.77899169921875, + "rewards//std": 0.03384561464190483, + "step": 3238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6478, + "grad_norm": 1.987202763557434, + "kl": 0.8220725562423468, + "learning_rate": 2.8138503631557213e-07, + "loss": 0.0822, + "num_tokens": 27993012.0, + "reward": 0.7869873046875, + "reward_std": 0.006649912800639868, + "rewards//mean": 0.7869873046875, + "rewards//std": 0.02630050666630268, + "step": 3239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.648, + "grad_norm": 4.776216506958008, + "kl": 2.261438911780715, + "learning_rate": 2.810996873740068e-07, + "loss": 0.2261, + "num_tokens": 28001748.0, + "reward": 0.7615966796875, + "reward_std": 0.015350927598774433, + "rewards//mean": 0.7615966796875, + "rewards//std": 0.03456898406147957, + "step": 3240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6482, + "grad_norm": 4.976326942443848, + "kl": 1.1828956119716167, + "learning_rate": 2.808144266054612e-07, + "loss": 0.1183, + "num_tokens": 28010300.0, + "reward": 0.7388916015625, + "reward_std": 0.007444972172379494, + "rewards//mean": 0.7388916015625, + "rewards//std": 0.03667835146188736, + "step": 3241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6484, + "grad_norm": 2.8397817611694336, + "kl": 1.8127122931182384, + "learning_rate": 2.805292541248384e-07, + "loss": 0.1813, + "num_tokens": 28018868.0, + "reward": 0.75213623046875, + "reward_std": 0.011681145057082176, + "rewards//mean": 0.75213623046875, + "rewards//std": 0.024408714845776558, + "step": 3242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6486, + "grad_norm": 2.3953850269317627, + "kl": 0.7334562726318836, + "learning_rate": 2.8024417004700595e-07, + "loss": 0.0733, + "num_tokens": 28027516.0, + "reward": 0.760009765625, + "reward_std": 0.005857095587998629, + "rewards//mean": 0.760009765625, + "rewards//std": 0.031295500695705414, + "step": 3243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6488, + "grad_norm": 6.899442672729492, + "kl": 1.769723767414689, + "learning_rate": 2.7995917448679534e-07, + "loss": 0.177, + "num_tokens": 28036172.0, + "reward": 0.76385498046875, + "reward_std": 0.004378088749945164, + "rewards//mean": 0.76385498046875, + "rewards//std": 0.021405935287475586, + "step": 3244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.649, + "grad_norm": 0.7946751713752747, + "kl": 0.734868923202157, + "learning_rate": 2.796742675590029e-07, + "loss": 0.0735, + "num_tokens": 28044732.0, + "reward": 0.74774169921875, + "reward_std": 0.003260610159486532, + "rewards//mean": 0.74774169921875, + "rewards//std": 0.021863535046577454, + "step": 3245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6492, + "grad_norm": 3.986832857131958, + "kl": 0.5981582198292017, + "learning_rate": 2.7938944937838923e-07, + "loss": 0.0598, + "num_tokens": 28053436.0, + "reward": 0.7777099609375, + "reward_std": 0.0032466966658830643, + "rewards//mean": 0.7777099609375, + "rewards//std": 0.018344644457101822, + "step": 3246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6494, + "grad_norm": 2.560429811477661, + "kl": 0.9547121785581112, + "learning_rate": 2.791047200596791e-07, + "loss": 0.0955, + "num_tokens": 28061996.0, + "reward": 0.747802734375, + "reward_std": 0.0060596526600420475, + "rewards//mean": 0.747802734375, + "rewards//std": 0.028003443032503128, + "step": 3247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6496, + "grad_norm": 1.2137703895568848, + "kl": 0.6001413427293301, + "learning_rate": 2.7882007971756113e-07, + "loss": 0.06, + "num_tokens": 28070612.0, + "reward": 0.75677490234375, + "reward_std": 0.0011931126937270164, + "rewards//mean": 0.75677490234375, + "rewards//std": 0.01989666000008583, + "step": 3248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6498, + "grad_norm": 2.034144163131714, + "kl": 1.1894249375909567, + "learning_rate": 2.785355284666886e-07, + "loss": 0.1189, + "num_tokens": 28079212.0, + "reward": 0.71783447265625, + "reward_std": 0.005483583547174931, + "rewards//mean": 0.71783447265625, + "rewards//std": 0.02804550714790821, + "step": 3249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.65, + "grad_norm": 1.1522042751312256, + "kl": 0.5814175475388765, + "learning_rate": 2.782510664216789e-07, + "loss": 0.0581, + "num_tokens": 28087868.0, + "reward": 0.74969482421875, + "reward_std": 0.0025895023718476295, + "rewards//mean": 0.74969482421875, + "rewards//std": 0.028976252302527428, + "step": 3250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6502, + "grad_norm": 8.392403602600098, + "kl": 1.4181362595409155, + "learning_rate": 2.779666936971129e-07, + "loss": 0.1418, + "num_tokens": 28096580.0, + "reward": 0.774169921875, + "reward_std": 0.013251601718366146, + "rewards//mean": 0.774169921875, + "rewards//std": 0.028466660529375076, + "step": 3251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6504, + "grad_norm": 3.742032527923584, + "kl": 1.1989432126283646, + "learning_rate": 2.776824104075364e-07, + "loss": 0.1199, + "num_tokens": 28105188.0, + "reward": 0.74993896484375, + "reward_std": 0.007053102366626263, + "rewards//mean": 0.74993896484375, + "rewards//std": 0.027334822341799736, + "step": 3252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6506, + "grad_norm": 2.4214890003204346, + "kl": 1.0721677131950855, + "learning_rate": 2.7739821666745817e-07, + "loss": 0.1072, + "num_tokens": 28113828.0, + "reward": 0.750244140625, + "reward_std": 0.007989597506821156, + "rewards//mean": 0.750244140625, + "rewards//std": 0.023197297006845474, + "step": 3253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6508, + "grad_norm": 3.413898229598999, + "kl": 1.9741854146122932, + "learning_rate": 2.7711411259135167e-07, + "loss": 0.1974, + "num_tokens": 28122404.0, + "reward": 0.7701416015625, + "reward_std": 0.011807329952716827, + "rewards//mean": 0.7701416015625, + "rewards//std": 0.033553920686244965, + "step": 3254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.651, + "grad_norm": 2.8387491703033447, + "kl": 0.6691036820411682, + "learning_rate": 2.768300982936541e-07, + "loss": 0.0669, + "num_tokens": 28131052.0, + "reward": 0.7630615234375, + "reward_std": 0.004823337309062481, + "rewards//mean": 0.7630615234375, + "rewards//std": 0.028350230306386948, + "step": 3255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6512, + "grad_norm": 1.9150416851043701, + "kl": 1.2213480584323406, + "learning_rate": 2.765461738887661e-07, + "loss": 0.1221, + "num_tokens": 28139668.0, + "reward": 0.76385498046875, + "reward_std": 0.01021348312497139, + "rewards//mean": 0.76385498046875, + "rewards//std": 0.041380446404218674, + "step": 3256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6514, + "grad_norm": 1.4733084440231323, + "kl": 1.3550290986895561, + "learning_rate": 2.762623394910525e-07, + "loss": 0.1355, + "num_tokens": 28148284.0, + "reward": 0.79010009765625, + "reward_std": 0.007758957799524069, + "rewards//mean": 0.79010009765625, + "rewards//std": 0.02704755775630474, + "step": 3257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6516, + "grad_norm": 1.855746865272522, + "kl": 1.2006632331758738, + "learning_rate": 2.759785952148418e-07, + "loss": 0.1201, + "num_tokens": 28156892.0, + "reward": 0.77520751953125, + "reward_std": 0.008750910870730877, + "rewards//mean": 0.77520751953125, + "rewards//std": 0.026910090819001198, + "step": 3258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6518, + "grad_norm": 0.5695237517356873, + "kl": 0.4469396751374006, + "learning_rate": 2.7569494117442635e-07, + "loss": 0.0447, + "num_tokens": 28165548.0, + "reward": 0.72979736328125, + "reward_std": 0.0008631674572825432, + "rewards//mean": 0.72979736328125, + "rewards//std": 0.0218052975833416, + "step": 3259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.652, + "grad_norm": 1.660649299621582, + "kl": 0.8386955857276917, + "learning_rate": 2.754113774840616e-07, + "loss": 0.0839, + "num_tokens": 28174228.0, + "reward": 0.74444580078125, + "reward_std": 0.006042172200977802, + "rewards//mean": 0.74444580078125, + "rewards//std": 0.034201547503471375, + "step": 3260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6522, + "grad_norm": 1.8678624629974365, + "kl": 0.7810480613261461, + "learning_rate": 2.751279042579672e-07, + "loss": 0.0781, + "num_tokens": 28182820.0, + "reward": 0.73419189453125, + "reward_std": 0.002756381407380104, + "rewards//mean": 0.73419189453125, + "rewards//std": 0.025652125477790833, + "step": 3261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6524, + "grad_norm": 3.202864170074463, + "kl": 1.5966161284595728, + "learning_rate": 2.748445216103262e-07, + "loss": 0.1597, + "num_tokens": 28191316.0, + "reward": 0.75775146484375, + "reward_std": 0.006226460449397564, + "rewards//mean": 0.75775146484375, + "rewards//std": 0.02637888304889202, + "step": 3262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6526, + "grad_norm": 2.659766674041748, + "kl": 2.1674531418830156, + "learning_rate": 2.745612296552847e-07, + "loss": 0.2167, + "num_tokens": 28199940.0, + "reward": 0.7447509765625, + "reward_std": 0.018480103462934494, + "rewards//mean": 0.7447509765625, + "rewards//std": 0.04030885547399521, + "step": 3263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6528, + "grad_norm": 3.772737979888916, + "kl": 1.2866682317107916, + "learning_rate": 2.74278028506953e-07, + "loss": 0.1287, + "num_tokens": 28208540.0, + "reward": 0.79901123046875, + "reward_std": 0.01174429152160883, + "rewards//mean": 0.79901123046875, + "rewards//std": 0.03489645570516586, + "step": 3264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.653, + "grad_norm": 2.7374162673950195, + "kl": 0.8429315276443958, + "learning_rate": 2.7399491827940444e-07, + "loss": 0.0843, + "num_tokens": 28217116.0, + "reward": 0.78277587890625, + "reward_std": 0.007184766698628664, + "rewards//mean": 0.78277587890625, + "rewards//std": 0.03184588998556137, + "step": 3265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6532, + "grad_norm": 4.006209850311279, + "kl": 1.6272545792162418, + "learning_rate": 2.73711899086676e-07, + "loss": 0.1627, + "num_tokens": 28225772.0, + "reward": 0.78326416015625, + "reward_std": 0.009059697389602661, + "rewards//mean": 0.78326416015625, + "rewards//std": 0.028998708352446556, + "step": 3266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6534, + "grad_norm": 4.762817859649658, + "kl": 2.7075423300266266, + "learning_rate": 2.734289710427673e-07, + "loss": 0.2708, + "num_tokens": 28234508.0, + "reward": 0.7542724609375, + "reward_std": 0.016216270625591278, + "rewards//mean": 0.7542724609375, + "rewards//std": 0.035185303539037704, + "step": 3267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6536, + "grad_norm": 12.894137382507324, + "kl": 2.6099146343767643, + "learning_rate": 2.73146134261642e-07, + "loss": 0.261, + "num_tokens": 28243244.0, + "reward": 0.76727294921875, + "reward_std": 0.015093198046088219, + "rewards//mean": 0.76727294921875, + "rewards//std": 0.04670316353440285, + "step": 3268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6538, + "grad_norm": 1.8125495910644531, + "kl": 0.8923916034400463, + "learning_rate": 2.728633888572267e-07, + "loss": 0.0892, + "num_tokens": 28251996.0, + "reward": 0.754150390625, + "reward_std": 0.0054050348699092865, + "rewards//mean": 0.754150390625, + "rewards//std": 0.0324803926050663, + "step": 3269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.654, + "grad_norm": 6.312792778015137, + "kl": 1.6264133844524622, + "learning_rate": 2.7258073494341136e-07, + "loss": 0.1626, + "num_tokens": 28260684.0, + "reward": 0.77276611328125, + "reward_std": 0.01061801053583622, + "rewards//mean": 0.77276611328125, + "rewards//std": 0.03598730266094208, + "step": 3270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6542, + "grad_norm": 2.006842851638794, + "kl": 0.7947744447737932, + "learning_rate": 2.7229817263404864e-07, + "loss": 0.0795, + "num_tokens": 28269292.0, + "reward": 0.7525634765625, + "reward_std": 0.0025721616111695766, + "rewards//mean": 0.7525634765625, + "rewards//std": 0.029775308445096016, + "step": 3271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6544, + "grad_norm": 4.051008701324463, + "kl": 1.746843095868826, + "learning_rate": 2.720157020429547e-07, + "loss": 0.1747, + "num_tokens": 28277860.0, + "reward": 0.7479248046875, + "reward_std": 0.007265838794410229, + "rewards//mean": 0.7479248046875, + "rewards//std": 0.02286173589527607, + "step": 3272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6546, + "grad_norm": 0.7582545876502991, + "kl": 0.6488897278904915, + "learning_rate": 2.7173332328390876e-07, + "loss": 0.0649, + "num_tokens": 28286452.0, + "reward": 0.74951171875, + "reward_std": 0.0006905339541845024, + "rewards//mean": 0.74951171875, + "rewards//std": 0.020522939041256905, + "step": 3273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6548, + "grad_norm": 1.305607557296753, + "kl": 1.1010812073946, + "learning_rate": 2.71451036470653e-07, + "loss": 0.1101, + "num_tokens": 28295140.0, + "reward": 0.77703857421875, + "reward_std": 0.007869764231145382, + "rewards//mean": 0.77703857421875, + "rewards//std": 0.03147630766034126, + "step": 3274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.655, + "grad_norm": 5.374042510986328, + "kl": 1.2996437530964613, + "learning_rate": 2.7116884171689236e-07, + "loss": 0.13, + "num_tokens": 28303748.0, + "reward": 0.78167724609375, + "reward_std": 0.012930039316415787, + "rewards//mean": 0.78167724609375, + "rewards//std": 0.02934843674302101, + "step": 3275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6552, + "grad_norm": 2.1267409324645996, + "kl": 1.4509250316768885, + "learning_rate": 2.708867391362948e-07, + "loss": 0.1451, + "num_tokens": 28312348.0, + "reward": 0.752197265625, + "reward_std": 0.011367673054337502, + "rewards//mean": 0.752197265625, + "rewards//std": 0.02246531844139099, + "step": 3276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6554, + "grad_norm": 14.103446960449219, + "kl": 2.4639233518391848, + "learning_rate": 2.706047288424914e-07, + "loss": 0.2464, + "num_tokens": 28320996.0, + "reward": 0.728759765625, + "reward_std": 0.00915514025837183, + "rewards//mean": 0.728759765625, + "rewards//std": 0.03982925042510033, + "step": 3277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6556, + "grad_norm": 4.125787258148193, + "kl": 2.1275758165866137, + "learning_rate": 2.7032281094907594e-07, + "loss": 0.2128, + "num_tokens": 28329636.0, + "reward": 0.75531005859375, + "reward_std": 0.014707660302519798, + "rewards//mean": 0.75531005859375, + "rewards//std": 0.026524242013692856, + "step": 3278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6558, + "grad_norm": 3.321157217025757, + "kl": 1.1565856896340847, + "learning_rate": 2.7004098556960454e-07, + "loss": 0.1157, + "num_tokens": 28338268.0, + "reward": 0.74853515625, + "reward_std": 0.012429611757397652, + "rewards//mean": 0.74853515625, + "rewards//std": 0.035512689501047134, + "step": 3279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.656, + "grad_norm": 11.309271812438965, + "kl": 2.6311013605445623, + "learning_rate": 2.697592528175967e-07, + "loss": 0.2631, + "num_tokens": 28346972.0, + "reward": 0.74200439453125, + "reward_std": 0.012479919008910656, + "rewards//mean": 0.74200439453125, + "rewards//std": 0.041721444576978683, + "step": 3280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6562, + "grad_norm": 11.59035587310791, + "kl": 2.4932159520685673, + "learning_rate": 2.6947761280653447e-07, + "loss": 0.2493, + "num_tokens": 28355668.0, + "reward": 0.7755126953125, + "reward_std": 0.00915472861379385, + "rewards//mean": 0.7755126953125, + "rewards//std": 0.02559104934334755, + "step": 3281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6564, + "grad_norm": 2.1180267333984375, + "kl": 1.0487906597554684, + "learning_rate": 2.6919606564986207e-07, + "loss": 0.1049, + "num_tokens": 28364252.0, + "reward": 0.7713623046875, + "reward_std": 0.005869538523256779, + "rewards//mean": 0.7713623046875, + "rewards//std": 0.029256286099553108, + "step": 3282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6566, + "grad_norm": 0.7472384572029114, + "kl": 0.6668932177126408, + "learning_rate": 2.6891461146098676e-07, + "loss": 0.0667, + "num_tokens": 28372812.0, + "reward": 0.77130126953125, + "reward_std": 0.0035410337150096893, + "rewards//mean": 0.77130126953125, + "rewards//std": 0.019807443022727966, + "step": 3283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6568, + "grad_norm": 7.553743839263916, + "kl": 0.9374720323830843, + "learning_rate": 2.686332503532783e-07, + "loss": 0.0937, + "num_tokens": 28381404.0, + "reward": 0.73565673828125, + "reward_std": 0.004164278507232666, + "rewards//mean": 0.73565673828125, + "rewards//std": 0.03245189040899277, + "step": 3284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.657, + "grad_norm": 1.1623544692993164, + "kl": 0.604723010212183, + "learning_rate": 2.683519824400692e-07, + "loss": 0.0605, + "num_tokens": 28390060.0, + "reward": 0.717529296875, + "reward_std": 0.0024168689269572496, + "rewards//mean": 0.717529296875, + "rewards//std": 0.03239826112985611, + "step": 3285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6572, + "grad_norm": 14.515250205993652, + "kl": 3.003290781751275, + "learning_rate": 2.680708078346537e-07, + "loss": 0.3003, + "num_tokens": 28398804.0, + "reward": 0.76251220703125, + "reward_std": 0.018274033442139626, + "rewards//mean": 0.76251220703125, + "rewards//std": 0.03788004443049431, + "step": 3286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6574, + "grad_norm": 2.355455160140991, + "kl": 1.1619539093226194, + "learning_rate": 2.6778972665028906e-07, + "loss": 0.1162, + "num_tokens": 28407388.0, + "reward": 0.737060546875, + "reward_std": 0.007963785901665688, + "rewards//mean": 0.737060546875, + "rewards//std": 0.01873856410384178, + "step": 3287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6576, + "grad_norm": 7.6698737144470215, + "kl": 2.005937883630395, + "learning_rate": 2.675087390001947e-07, + "loss": 0.2006, + "num_tokens": 28416076.0, + "reward": 0.7314453125, + "reward_std": 0.009930286556482315, + "rewards//mean": 0.7314453125, + "rewards//std": 0.03109460510313511, + "step": 3288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6578, + "grad_norm": 1.4207866191864014, + "kl": 1.0361197870224714, + "learning_rate": 2.6722784499755267e-07, + "loss": 0.1036, + "num_tokens": 28424780.0, + "reward": 0.7547607421875, + "reward_std": 0.004800085909664631, + "rewards//mean": 0.7547607421875, + "rewards//std": 0.030564062297344208, + "step": 3289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.658, + "grad_norm": 2.120121479034424, + "kl": 0.7763578072190285, + "learning_rate": 2.6694704475550666e-07, + "loss": 0.0776, + "num_tokens": 28433412.0, + "reward": 0.76654052734375, + "reward_std": 0.004520035814493895, + "rewards//mean": 0.76654052734375, + "rewards//std": 0.022355807945132256, + "step": 3290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6582, + "grad_norm": 8.612874984741211, + "kl": 2.55491279438138, + "learning_rate": 2.6666633838716314e-07, + "loss": 0.2555, + "num_tokens": 28442124.0, + "reward": 0.78509521484375, + "reward_std": 0.010530218482017517, + "rewards//mean": 0.78509521484375, + "rewards//std": 0.029524309560656548, + "step": 3291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6584, + "grad_norm": 0.31139737367630005, + "kl": 0.44030685164034367, + "learning_rate": 2.6638572600559063e-07, + "loss": 0.044, + "num_tokens": 28450780.0, + "reward": 0.75372314453125, + "reward_std": 0.0006596610764972866, + "rewards//mean": 0.75372314453125, + "rewards//std": 0.025319505482912064, + "step": 3292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6586, + "grad_norm": 5.030651092529297, + "kl": 2.0082827117294073, + "learning_rate": 2.6610520772381996e-07, + "loss": 0.2008, + "num_tokens": 28459524.0, + "reward": 0.746337890625, + "reward_std": 0.010198493488132954, + "rewards//mean": 0.746337890625, + "rewards//std": 0.03616401180624962, + "step": 3293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6588, + "grad_norm": 2.9165425300598145, + "kl": 0.8357100766152143, + "learning_rate": 2.658247836548434e-07, + "loss": 0.0836, + "num_tokens": 28468156.0, + "reward": 0.75048828125, + "reward_std": 0.0030966063495725393, + "rewards//mean": 0.75048828125, + "rewards//std": 0.0238015316426754, + "step": 3294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.659, + "grad_norm": 3.2285470962524414, + "kl": 1.6311288066208363, + "learning_rate": 2.65544453911616e-07, + "loss": 0.1631, + "num_tokens": 28476748.0, + "reward": 0.75091552734375, + "reward_std": 0.007480588275939226, + "rewards//mean": 0.75091552734375, + "rewards//std": 0.030399372801184654, + "step": 3295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6592, + "grad_norm": 2.85044527053833, + "kl": 1.774171121418476, + "learning_rate": 2.6526421860705473e-07, + "loss": 0.1774, + "num_tokens": 28485412.0, + "reward": 0.74627685546875, + "reward_std": 0.007223515771329403, + "rewards//mean": 0.74627685546875, + "rewards//std": 0.028352031484246254, + "step": 3296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6594, + "grad_norm": 1.535612940788269, + "kl": 0.8223083559423685, + "learning_rate": 2.649840778540379e-07, + "loss": 0.0822, + "num_tokens": 28494060.0, + "reward": 0.74835205078125, + "reward_std": 0.006122248247265816, + "rewards//mean": 0.74835205078125, + "rewards//std": 0.031085291877388954, + "step": 3297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6596, + "grad_norm": 11.137580871582031, + "kl": 2.384360386058688, + "learning_rate": 2.6470403176540644e-07, + "loss": 0.2384, + "num_tokens": 28502708.0, + "reward": 0.74310302734375, + "reward_std": 0.008183266967535019, + "rewards//mean": 0.74310302734375, + "rewards//std": 0.02882908098399639, + "step": 3298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6598, + "grad_norm": 7.293783664703369, + "kl": 1.7389510218054056, + "learning_rate": 2.644240804539629e-07, + "loss": 0.1739, + "num_tokens": 28511340.0, + "reward": 0.76263427734375, + "reward_std": 0.003514254931360483, + "rewards//mean": 0.76263427734375, + "rewards//std": 0.030534517019987106, + "step": 3299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.66, + "grad_norm": 4.593398571014404, + "kl": 1.1114847995340824, + "learning_rate": 2.641442240324717e-07, + "loss": 0.1111, + "num_tokens": 28519956.0, + "reward": 0.7991943359375, + "reward_std": 0.008348455652594566, + "rewards//mean": 0.7991943359375, + "rewards//std": 0.02495615929365158, + "step": 3300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6602, + "grad_norm": 5.031010627746582, + "kl": 1.0714304354041815, + "learning_rate": 2.638644626136587e-07, + "loss": 0.1071, + "num_tokens": 28528548.0, + "reward": 0.75653076171875, + "reward_std": 0.004375926218926907, + "rewards//mean": 0.75653076171875, + "rewards//std": 0.0163529384881258, + "step": 3301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6604, + "grad_norm": 0.43760189414024353, + "kl": 0.44404924288392067, + "learning_rate": 2.635847963102119e-07, + "loss": 0.0444, + "num_tokens": 28537116.0, + "reward": 0.73150634765625, + "reward_std": 0.0001726334885461256, + "rewards//mean": 0.73150634765625, + "rewards//std": 0.029283374547958374, + "step": 3302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6606, + "grad_norm": 2.851663589477539, + "kl": 1.7404516078531742, + "learning_rate": 2.6330522523478084e-07, + "loss": 0.174, + "num_tokens": 28545812.0, + "reward": 0.7552490234375, + "reward_std": 0.013324678875505924, + "rewards//mean": 0.7552490234375, + "rewards//std": 0.03480812534689903, + "step": 3303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6608, + "grad_norm": 2.114204168319702, + "kl": 2.7939234878867865, + "learning_rate": 2.63025749499977e-07, + "loss": 0.2794, + "num_tokens": 28554516.0, + "reward": 0.7535400390625, + "reward_std": 0.023100275546312332, + "rewards//mean": 0.7535400390625, + "rewards//std": 0.03595474362373352, + "step": 3304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.661, + "grad_norm": 1.5483964681625366, + "kl": 0.8551100734621286, + "learning_rate": 2.6274636921837267e-07, + "loss": 0.0855, + "num_tokens": 28563092.0, + "reward": 0.76214599609375, + "reward_std": 0.00523786386474967, + "rewards//mean": 0.76214599609375, + "rewards//std": 0.027144765481352806, + "step": 3305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6612, + "grad_norm": 1.5940011739730835, + "kl": 1.5123450588434935, + "learning_rate": 2.6246708450250256e-07, + "loss": 0.1512, + "num_tokens": 28571732.0, + "reward": 0.81268310546875, + "reward_std": 0.011424384079873562, + "rewards//mean": 0.81268310546875, + "rewards//std": 0.02805144339799881, + "step": 3306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6614, + "grad_norm": 1.7513694763183594, + "kl": 1.8115441016852856, + "learning_rate": 2.621878954648623e-07, + "loss": 0.1812, + "num_tokens": 28580292.0, + "reward": 0.77801513671875, + "reward_std": 0.012671878561377525, + "rewards//mean": 0.77801513671875, + "rewards//std": 0.030085530132055283, + "step": 3307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6616, + "grad_norm": 6.927548885345459, + "kl": 1.4087862968444824, + "learning_rate": 2.6190880221790954e-07, + "loss": 0.1409, + "num_tokens": 28588900.0, + "reward": 0.75537109375, + "reward_std": 0.012232872657477856, + "rewards//mean": 0.75537109375, + "rewards//std": 0.0288690235465765, + "step": 3308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6618, + "grad_norm": 2.2099010944366455, + "kl": 1.2589291390031576, + "learning_rate": 2.6162980487406253e-07, + "loss": 0.1259, + "num_tokens": 28597476.0, + "reward": 0.75958251953125, + "reward_std": 0.007502012420445681, + "rewards//mean": 0.75958251953125, + "rewards//std": 0.0342378206551075, + "step": 3309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.662, + "grad_norm": 4.658015727996826, + "kl": 0.6481737792491913, + "learning_rate": 2.6135090354570165e-07, + "loss": 0.0648, + "num_tokens": 28606044.0, + "reward": 0.737548828125, + "reward_std": 0.0032974397763609886, + "rewards//mean": 0.737548828125, + "rewards//std": 0.03385327383875847, + "step": 3310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6622, + "grad_norm": 9.252667427062988, + "kl": 2.4124185610562563, + "learning_rate": 2.610720983451685e-07, + "loss": 0.2412, + "num_tokens": 28614700.0, + "reward": 0.75457763671875, + "reward_std": 0.010173780843615532, + "rewards//mean": 0.75457763671875, + "rewards//std": 0.0362488329410553, + "step": 3311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6624, + "grad_norm": 1.9169588088989258, + "kl": 1.345984298735857, + "learning_rate": 2.6079338938476536e-07, + "loss": 0.1346, + "num_tokens": 28623324.0, + "reward": 0.736572265625, + "reward_std": 0.005363935604691505, + "rewards//mean": 0.736572265625, + "rewards//std": 0.031124772503972054, + "step": 3312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6626, + "grad_norm": 1.6948258876800537, + "kl": 1.0349956154823303, + "learning_rate": 2.605147767767564e-07, + "loss": 0.1035, + "num_tokens": 28631940.0, + "reward": 0.75592041015625, + "reward_std": 0.007478573825210333, + "rewards//mean": 0.75592041015625, + "rewards//std": 0.020960737019777298, + "step": 3313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6628, + "grad_norm": 2.9499571323394775, + "kl": 1.3650176785886288, + "learning_rate": 2.6023626063336665e-07, + "loss": 0.1365, + "num_tokens": 28640580.0, + "reward": 0.73236083984375, + "reward_std": 0.007084453944116831, + "rewards//mean": 0.73236083984375, + "rewards//std": 0.03197018802165985, + "step": 3314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.663, + "grad_norm": 1.5131349563598633, + "kl": 0.9543469343334436, + "learning_rate": 2.5995784106678263e-07, + "loss": 0.0954, + "num_tokens": 28649252.0, + "reward": 0.7486572265625, + "reward_std": 0.0036433290224522352, + "rewards//mean": 0.7486572265625, + "rewards//std": 0.024941598996520042, + "step": 3315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6632, + "grad_norm": 1.741931676864624, + "kl": 0.8016622383147478, + "learning_rate": 2.5967951818915136e-07, + "loss": 0.0802, + "num_tokens": 28657884.0, + "reward": 0.748779296875, + "reward_std": 0.005971398204565048, + "rewards//mean": 0.748779296875, + "rewards//std": 0.03121025487780571, + "step": 3316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6634, + "grad_norm": 3.6230530738830566, + "kl": 1.420137545093894, + "learning_rate": 2.5940129211258146e-07, + "loss": 0.142, + "num_tokens": 28666572.0, + "reward": 0.76788330078125, + "reward_std": 0.009566002525389194, + "rewards//mean": 0.76788330078125, + "rewards//std": 0.03284322842955589, + "step": 3317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6636, + "grad_norm": 1.104433298110962, + "kl": 0.6210757419466972, + "learning_rate": 2.591231629491423e-07, + "loss": 0.0621, + "num_tokens": 28675172.0, + "reward": 0.7684326171875, + "reward_std": 0.0037668875884264708, + "rewards//mean": 0.7684326171875, + "rewards//std": 0.02574673853814602, + "step": 3318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6638, + "grad_norm": 1.7020106315612793, + "kl": 1.2296782620251179, + "learning_rate": 2.5884513081086446e-07, + "loss": 0.123, + "num_tokens": 28683724.0, + "reward": 0.7686767578125, + "reward_std": 0.007383274380117655, + "rewards//mean": 0.7686767578125, + "rewards//std": 0.0222768671810627, + "step": 3319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.664, + "grad_norm": 10.09396743774414, + "kl": 2.0072640515863895, + "learning_rate": 2.585671958097389e-07, + "loss": 0.2007, + "num_tokens": 28692508.0, + "reward": 0.759033203125, + "reward_std": 0.0122595289722085, + "rewards//mean": 0.759033203125, + "rewards//std": 0.03090611845254898, + "step": 3320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6642, + "grad_norm": 2.2549920082092285, + "kl": 1.984058128669858, + "learning_rate": 2.58289358057718e-07, + "loss": 0.1984, + "num_tokens": 28701156.0, + "reward": 0.7615966796875, + "reward_std": 0.010478060692548752, + "rewards//mean": 0.7615966796875, + "rewards//std": 0.033042941242456436, + "step": 3321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6644, + "grad_norm": 1.1082855463027954, + "kl": 0.9334073290228844, + "learning_rate": 2.5801161766671483e-07, + "loss": 0.0933, + "num_tokens": 28709756.0, + "reward": 0.77325439453125, + "reward_std": 0.006578098516911268, + "rewards//mean": 0.77325439453125, + "rewards//std": 0.02818925492465496, + "step": 3322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6646, + "grad_norm": 5.354910850524902, + "kl": 0.9304297156631947, + "learning_rate": 2.5773397474860325e-07, + "loss": 0.093, + "num_tokens": 28718532.0, + "reward": 0.7705078125, + "reward_std": 0.004286447074264288, + "rewards//mean": 0.7705078125, + "rewards//std": 0.03653133660554886, + "step": 3323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6648, + "grad_norm": 5.0557122230529785, + "kl": 1.6059030257165432, + "learning_rate": 2.574564294152175e-07, + "loss": 0.1606, + "num_tokens": 28727132.0, + "reward": 0.76556396484375, + "reward_std": 0.008403629064559937, + "rewards//mean": 0.76556396484375, + "rewards//std": 0.024165010079741478, + "step": 3324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.665, + "grad_norm": 7.176552772521973, + "kl": 2.3600514624267817, + "learning_rate": 2.5717898177835296e-07, + "loss": 0.236, + "num_tokens": 28735772.0, + "reward": 0.74530029296875, + "reward_std": 0.014232851564884186, + "rewards//mean": 0.74530029296875, + "rewards//std": 0.0316135510802269, + "step": 3325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6652, + "grad_norm": 9.006481170654297, + "kl": 1.922513511031866, + "learning_rate": 2.5690163194976573e-07, + "loss": 0.1923, + "num_tokens": 28744412.0, + "reward": 0.74383544921875, + "reward_std": 0.006600453983992338, + "rewards//mean": 0.74383544921875, + "rewards//std": 0.03738204017281532, + "step": 3326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6654, + "grad_norm": 2.335500717163086, + "kl": 1.1347531117498875, + "learning_rate": 2.566243800411719e-07, + "loss": 0.1135, + "num_tokens": 28753020.0, + "reward": 0.768310546875, + "reward_std": 0.0043824221938848495, + "rewards//mean": 0.768310546875, + "rewards//std": 0.021919630467891693, + "step": 3327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6656, + "grad_norm": 1.778923749923706, + "kl": 0.8382088672369719, + "learning_rate": 2.563472261642486e-07, + "loss": 0.0838, + "num_tokens": 28761652.0, + "reward": 0.75665283203125, + "reward_std": 0.0037696112412959337, + "rewards//mean": 0.75665283203125, + "rewards//std": 0.025739314034581184, + "step": 3328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6658, + "grad_norm": 3.2307591438293457, + "kl": 1.5740308947861195, + "learning_rate": 2.5607017043063353e-07, + "loss": 0.1574, + "num_tokens": 28770340.0, + "reward": 0.7528076171875, + "reward_std": 0.011582694016397, + "rewards//mean": 0.7528076171875, + "rewards//std": 0.026842854917049408, + "step": 3329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.666, + "grad_norm": 1.7276089191436768, + "kl": 1.263618377968669, + "learning_rate": 2.557932129519249e-07, + "loss": 0.1264, + "num_tokens": 28778972.0, + "reward": 0.7332763671875, + "reward_std": 0.00414077565073967, + "rewards//mean": 0.7332763671875, + "rewards//std": 0.028232516720891, + "step": 3330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6662, + "grad_norm": 1.402947187423706, + "kl": 0.9537486881017685, + "learning_rate": 2.555163538396806e-07, + "loss": 0.0954, + "num_tokens": 28787604.0, + "reward": 0.7432861328125, + "reward_std": 0.005531432572752237, + "rewards//mean": 0.7432861328125, + "rewards//std": 0.029947664588689804, + "step": 3331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6664, + "grad_norm": 1.0045316219329834, + "kl": 1.0441008415073156, + "learning_rate": 2.552395932054198e-07, + "loss": 0.1044, + "num_tokens": 28796300.0, + "reward": 0.76348876953125, + "reward_std": 0.004225175827741623, + "rewards//mean": 0.76348876953125, + "rewards//std": 0.025704003870487213, + "step": 3332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6666, + "grad_norm": 1.6437220573425293, + "kl": 0.822501178830862, + "learning_rate": 2.5496293116062153e-07, + "loss": 0.0823, + "num_tokens": 28804860.0, + "reward": 0.7716064453125, + "reward_std": 0.0028685256838798523, + "rewards//mean": 0.7716064453125, + "rewards//std": 0.03519390523433685, + "step": 3333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6668, + "grad_norm": 4.6441969871521, + "kl": 2.054376570507884, + "learning_rate": 2.546863678167255e-07, + "loss": 0.2054, + "num_tokens": 28813540.0, + "reward": 0.7342529296875, + "reward_std": 0.009032205678522587, + "rewards//mean": 0.7342529296875, + "rewards//std": 0.028176698833703995, + "step": 3334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.667, + "grad_norm": 1.2166770696640015, + "kl": 0.59235705062747, + "learning_rate": 2.5440990328513096e-07, + "loss": 0.0592, + "num_tokens": 28822076.0, + "reward": 0.7822265625, + "reward_std": 0.0011643373873084784, + "rewards//mean": 0.7822265625, + "rewards//std": 0.025055741891264915, + "step": 3335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6672, + "grad_norm": 1.2454607486724854, + "kl": 0.9859358686953783, + "learning_rate": 2.54133537677198e-07, + "loss": 0.0986, + "num_tokens": 28830684.0, + "reward": 0.72796630859375, + "reward_std": 0.00489396508783102, + "rewards//mean": 0.72796630859375, + "rewards//std": 0.03494110703468323, + "step": 3336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6674, + "grad_norm": 1.017585039138794, + "kl": 0.6173009965568781, + "learning_rate": 2.538572711042469e-07, + "loss": 0.0617, + "num_tokens": 28839284.0, + "reward": 0.73614501953125, + "reward_std": 0.002140125259757042, + "rewards//mean": 0.73614501953125, + "rewards//std": 0.01655716635286808, + "step": 3337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6676, + "grad_norm": 2.1403279304504395, + "kl": 1.0581932496279478, + "learning_rate": 2.535811036775574e-07, + "loss": 0.1058, + "num_tokens": 28848012.0, + "reward": 0.767333984375, + "reward_std": 0.0061898427084088326, + "rewards//mean": 0.767333984375, + "rewards//std": 0.030503826215863228, + "step": 3338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6678, + "grad_norm": 1.3511333465576172, + "kl": 1.3425128255039454, + "learning_rate": 2.5330503550837004e-07, + "loss": 0.1343, + "num_tokens": 28856756.0, + "reward": 0.72991943359375, + "reward_std": 0.008520022965967655, + "rewards//mean": 0.72991943359375, + "rewards//std": 0.03304219990968704, + "step": 3339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.668, + "grad_norm": 2.8398826122283936, + "kl": 0.9562410432845354, + "learning_rate": 2.530290667078846e-07, + "loss": 0.0956, + "num_tokens": 28865524.0, + "reward": 0.76300048828125, + "reward_std": 0.004682873375713825, + "rewards//mean": 0.76300048828125, + "rewards//std": 0.02088041976094246, + "step": 3340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6682, + "grad_norm": 2.4122958183288574, + "kl": 1.7735716942697763, + "learning_rate": 2.5275319738726165e-07, + "loss": 0.1774, + "num_tokens": 28874108.0, + "reward": 0.741943359375, + "reward_std": 0.017024485394358635, + "rewards//mean": 0.741943359375, + "rewards//std": 0.03245054930448532, + "step": 3341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6684, + "grad_norm": 4.193367958068848, + "kl": 1.0978987105190754, + "learning_rate": 2.524774276576214e-07, + "loss": 0.1098, + "num_tokens": 28882668.0, + "reward": 0.75970458984375, + "reward_std": 0.010839719325304031, + "rewards//mean": 0.75970458984375, + "rewards//std": 0.03238418325781822, + "step": 3342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6686, + "grad_norm": 4.227565765380859, + "kl": 1.3143008463084698, + "learning_rate": 2.522017576300434e-07, + "loss": 0.1314, + "num_tokens": 28891332.0, + "reward": 0.7681884765625, + "reward_std": 0.004499755799770355, + "rewards//mean": 0.7681884765625, + "rewards//std": 0.01888461783528328, + "step": 3343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6688, + "grad_norm": 5.127817153930664, + "kl": 1.2840922363102436, + "learning_rate": 2.519261874155679e-07, + "loss": 0.1284, + "num_tokens": 28899884.0, + "reward": 0.73956298828125, + "reward_std": 0.006365499459207058, + "rewards//mean": 0.73956298828125, + "rewards//std": 0.031110605224967003, + "step": 3344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.669, + "grad_norm": 1.283121943473816, + "kl": 0.7308017704635859, + "learning_rate": 2.5165071712519445e-07, + "loss": 0.0731, + "num_tokens": 28908484.0, + "reward": 0.76605224609375, + "reward_std": 0.0031384890899062157, + "rewards//mean": 0.76605224609375, + "rewards//std": 0.0207997914403677, + "step": 3345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6692, + "grad_norm": 2.849968194961548, + "kl": 1.4851772841066122, + "learning_rate": 2.513753468698826e-07, + "loss": 0.1485, + "num_tokens": 28917300.0, + "reward": 0.74298095703125, + "reward_std": 0.009423274546861649, + "rewards//mean": 0.74298095703125, + "rewards//std": 0.024011043831706047, + "step": 3346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6694, + "grad_norm": 0.6154866218566895, + "kl": 0.7080034501850605, + "learning_rate": 2.5110007676055107e-07, + "loss": 0.0708, + "num_tokens": 28925996.0, + "reward": 0.77392578125, + "reward_std": 0.004223279654979706, + "rewards//mean": 0.77392578125, + "rewards//std": 0.0156327486038208, + "step": 3347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6696, + "grad_norm": 3.1175663471221924, + "kl": 2.064055424183607, + "learning_rate": 2.508249069080789e-07, + "loss": 0.2064, + "num_tokens": 28934660.0, + "reward": 0.7691650390625, + "reward_std": 0.01608796790242195, + "rewards//mean": 0.7691650390625, + "rewards//std": 0.03458124026656151, + "step": 3348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6698, + "grad_norm": 3.6027255058288574, + "kl": 1.0180056914687157, + "learning_rate": 2.5054983742330437e-07, + "loss": 0.1018, + "num_tokens": 28943300.0, + "reward": 0.7601318359375, + "reward_std": 0.005631064996123314, + "rewards//mean": 0.7601318359375, + "rewards//std": 0.03005463443696499, + "step": 3349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.67, + "grad_norm": 0.7520749568939209, + "kl": 0.8603554703295231, + "learning_rate": 2.5027486841702577e-07, + "loss": 0.086, + "num_tokens": 28951956.0, + "reward": 0.73931884765625, + "reward_std": 0.00388792110607028, + "rewards//mean": 0.73931884765625, + "rewards//std": 0.02528899535536766, + "step": 3350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6702, + "grad_norm": 3.4540960788726807, + "kl": 1.1540595144033432, + "learning_rate": 2.500000000000001e-07, + "loss": 0.1154, + "num_tokens": 28960556.0, + "reward": 0.778564453125, + "reward_std": 0.008547404780983925, + "rewards//mean": 0.778564453125, + "rewards//std": 0.025561751797795296, + "step": 3351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6704, + "grad_norm": 2.3329193592071533, + "kl": 1.5313286781311035, + "learning_rate": 2.497252322829445e-07, + "loss": 0.1531, + "num_tokens": 28969260.0, + "reward": 0.7724609375, + "reward_std": 0.007602051831781864, + "rewards//mean": 0.7724609375, + "rewards//std": 0.036471616476774216, + "step": 3352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6706, + "grad_norm": 11.420098304748535, + "kl": 2.77924239076674, + "learning_rate": 2.494505653765354e-07, + "loss": 0.2779, + "num_tokens": 28977828.0, + "reward": 0.77276611328125, + "reward_std": 0.00805868674069643, + "rewards//mean": 0.77276611328125, + "rewards//std": 0.03845985233783722, + "step": 3353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6708, + "grad_norm": 0.9464414715766907, + "kl": 0.6240844558924437, + "learning_rate": 2.491759993914088e-07, + "loss": 0.0624, + "num_tokens": 28986508.0, + "reward": 0.8028564453125, + "reward_std": 0.0031074027065187693, + "rewards//mean": 0.8028564453125, + "rewards//std": 0.017538035288453102, + "step": 3354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.671, + "grad_norm": 1.7805505990982056, + "kl": 1.079855140298605, + "learning_rate": 2.489015344381595e-07, + "loss": 0.108, + "num_tokens": 28995140.0, + "reward": 0.7244873046875, + "reward_std": 0.00747663713991642, + "rewards//mean": 0.7244873046875, + "rewards//std": 0.03945713862776756, + "step": 3355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6712, + "grad_norm": 1.4576828479766846, + "kl": 0.8484917916357517, + "learning_rate": 2.4862717062734206e-07, + "loss": 0.0848, + "num_tokens": 29003708.0, + "reward": 0.75714111328125, + "reward_std": 0.006578098516911268, + "rewards//mean": 0.75714111328125, + "rewards//std": 0.03169293701648712, + "step": 3356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6714, + "grad_norm": 0.9185293316841125, + "kl": 1.0268923249095678, + "learning_rate": 2.4835290806947045e-07, + "loss": 0.1027, + "num_tokens": 29012324.0, + "reward": 0.75372314453125, + "reward_std": 0.0031976401805877686, + "rewards//mean": 0.75372314453125, + "rewards//std": 0.0251467302441597, + "step": 3357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6716, + "grad_norm": 2.312732696533203, + "kl": 1.874905502423644, + "learning_rate": 2.4807874687501715e-07, + "loss": 0.1875, + "num_tokens": 29020940.0, + "reward": 0.7342529296875, + "reward_std": 0.007835041731595993, + "rewards//mean": 0.7342529296875, + "rewards//std": 0.029248006641864777, + "step": 3358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6718, + "grad_norm": 0.9682368040084839, + "kl": 1.04834808036685, + "learning_rate": 2.4780468715441457e-07, + "loss": 0.1048, + "num_tokens": 29029572.0, + "reward": 0.7850341796875, + "reward_std": 0.007691686507314444, + "rewards//mean": 0.7850341796875, + "rewards//std": 0.02663906291127205, + "step": 3359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.672, + "grad_norm": 1.674652338027954, + "kl": 1.4918945170938969, + "learning_rate": 2.4753072901805376e-07, + "loss": 0.1492, + "num_tokens": 29038164.0, + "reward": 0.7520751953125, + "reward_std": 0.007512320764362812, + "rewards//mean": 0.7520751953125, + "rewards//std": 0.02973257191479206, + "step": 3360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6722, + "grad_norm": 1.9029251337051392, + "kl": 0.8768682722002268, + "learning_rate": 2.472568725762853e-07, + "loss": 0.0877, + "num_tokens": 29046716.0, + "reward": 0.7327880859375, + "reward_std": 0.004658429883420467, + "rewards//mean": 0.7327880859375, + "rewards//std": 0.030423078685998917, + "step": 3361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6724, + "grad_norm": 8.497259140014648, + "kl": 1.1850367225706577, + "learning_rate": 2.469831179394182e-07, + "loss": 0.1185, + "num_tokens": 29055356.0, + "reward": 0.79144287109375, + "reward_std": 0.01158674992620945, + "rewards//mean": 0.79144287109375, + "rewards//std": 0.025944922119379044, + "step": 3362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6726, + "grad_norm": 1.493013858795166, + "kl": 1.2531673088669777, + "learning_rate": 2.467094652177209e-07, + "loss": 0.1253, + "num_tokens": 29064068.0, + "reward": 0.7388916015625, + "reward_std": 0.005088899750262499, + "rewards//mean": 0.7388916015625, + "rewards//std": 0.019700638949871063, + "step": 3363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6728, + "grad_norm": 1.873815655708313, + "kl": 0.7929435167461634, + "learning_rate": 2.464359145214207e-07, + "loss": 0.0793, + "num_tokens": 29072756.0, + "reward": 0.74029541015625, + "reward_std": 0.003975429572165012, + "rewards//mean": 0.74029541015625, + "rewards//std": 0.026675619184970856, + "step": 3364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.673, + "grad_norm": 1.3498497009277344, + "kl": 1.3159999530762434, + "learning_rate": 2.46162465960704e-07, + "loss": 0.1316, + "num_tokens": 29081300.0, + "reward": 0.73809814453125, + "reward_std": 0.0075944168493151665, + "rewards//mean": 0.73809814453125, + "rewards//std": 0.03169962391257286, + "step": 3365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6732, + "grad_norm": 5.301769733428955, + "kl": 1.039096849039197, + "learning_rate": 2.458891196457155e-07, + "loss": 0.1039, + "num_tokens": 29089884.0, + "reward": 0.74029541015625, + "reward_std": 0.005732233636081219, + "rewards//mean": 0.74029541015625, + "rewards//std": 0.025181017816066742, + "step": 3366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6734, + "grad_norm": 7.274405479431152, + "kl": 2.1661130636930466, + "learning_rate": 2.4561587568655924e-07, + "loss": 0.2166, + "num_tokens": 29098604.0, + "reward": 0.76409912109375, + "reward_std": 0.006481437012553215, + "rewards//mean": 0.76409912109375, + "rewards//std": 0.03418605402112007, + "step": 3367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6736, + "grad_norm": 2.9360175132751465, + "kl": 1.7305943351238966, + "learning_rate": 2.4534273419329775e-07, + "loss": 0.1731, + "num_tokens": 29107220.0, + "reward": 0.75811767578125, + "reward_std": 0.006910928059369326, + "rewards//mean": 0.75811767578125, + "rewards//std": 0.02010328881442547, + "step": 3368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6738, + "grad_norm": 1.0722098350524902, + "kl": 1.0907705649733543, + "learning_rate": 2.450696952759527e-07, + "loss": 0.1091, + "num_tokens": 29115788.0, + "reward": 0.72930908203125, + "reward_std": 0.006467911880463362, + "rewards//mean": 0.72930908203125, + "rewards//std": 0.03076319582760334, + "step": 3369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.674, + "grad_norm": 3.2365825176239014, + "kl": 1.9809395596385002, + "learning_rate": 2.4479675904450376e-07, + "loss": 0.1981, + "num_tokens": 29124476.0, + "reward": 0.7584228515625, + "reward_std": 0.01058410108089447, + "rewards//mean": 0.7584228515625, + "rewards//std": 0.03188087046146393, + "step": 3370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6742, + "grad_norm": 2.164930582046509, + "kl": 1.644496949389577, + "learning_rate": 2.4452392560888976e-07, + "loss": 0.1644, + "num_tokens": 29133100.0, + "reward": 0.77899169921875, + "reward_std": 0.013949566520750523, + "rewards//mean": 0.77899169921875, + "rewards//std": 0.028363775461912155, + "step": 3371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6744, + "grad_norm": 6.237517833709717, + "kl": 2.3561666291207075, + "learning_rate": 2.442511950790081e-07, + "loss": 0.2356, + "num_tokens": 29141684.0, + "reward": 0.7376708984375, + "reward_std": 0.013551203534007072, + "rewards//mean": 0.7376708984375, + "rewards//std": 0.03727442026138306, + "step": 3372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6746, + "grad_norm": 0.7245063781738281, + "kl": 0.8380575086921453, + "learning_rate": 2.439785675647143e-07, + "loss": 0.0838, + "num_tokens": 29150244.0, + "reward": 0.78173828125, + "reward_std": 0.004425089806318283, + "rewards//mean": 0.78173828125, + "rewards//std": 0.024165088310837746, + "step": 3373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6748, + "grad_norm": 3.565131187438965, + "kl": 1.4041677303612232, + "learning_rate": 2.4370604317582286e-07, + "loss": 0.1404, + "num_tokens": 29158932.0, + "reward": 0.76348876953125, + "reward_std": 0.012233685702085495, + "rewards//mean": 0.76348876953125, + "rewards//std": 0.03314511850476265, + "step": 3374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.675, + "grad_norm": 2.964259624481201, + "kl": 1.6570193227380514, + "learning_rate": 2.4343362202210667e-07, + "loss": 0.1657, + "num_tokens": 29167500.0, + "reward": 0.76776123046875, + "reward_std": 0.005918186157941818, + "rewards//mean": 0.76776123046875, + "rewards//std": 0.023498699069023132, + "step": 3375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6752, + "grad_norm": 1.2595579624176025, + "kl": 1.4089453779160976, + "learning_rate": 2.4316130421329696e-07, + "loss": 0.1409, + "num_tokens": 29176076.0, + "reward": 0.76715087890625, + "reward_std": 0.010600244626402855, + "rewards//mean": 0.76715087890625, + "rewards//std": 0.03493591025471687, + "step": 3376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6754, + "grad_norm": 3.9707398414611816, + "kl": 1.5365128219127655, + "learning_rate": 2.42889089859083e-07, + "loss": 0.1537, + "num_tokens": 29184860.0, + "reward": 0.75909423828125, + "reward_std": 0.00749754486605525, + "rewards//mean": 0.75909423828125, + "rewards//std": 0.021632445976138115, + "step": 3377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6756, + "grad_norm": 4.609278202056885, + "kl": 1.1338660642504692, + "learning_rate": 2.426169790691129e-07, + "loss": 0.1134, + "num_tokens": 29193484.0, + "reward": 0.74847412109375, + "reward_std": 0.006449209991842508, + "rewards//mean": 0.74847412109375, + "rewards//std": 0.03185349702835083, + "step": 3378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6758, + "grad_norm": 8.429368019104004, + "kl": 2.514424927532673, + "learning_rate": 2.4234497195299287e-07, + "loss": 0.2514, + "num_tokens": 29202140.0, + "reward": 0.74755859375, + "reward_std": 0.012200634926557541, + "rewards//mean": 0.74755859375, + "rewards//std": 0.025195516645908356, + "step": 3379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.676, + "grad_norm": 6.260903358459473, + "kl": 0.8797468673437834, + "learning_rate": 2.4207306862028753e-07, + "loss": 0.088, + "num_tokens": 29210684.0, + "reward": 0.77294921875, + "reward_std": 0.0031914329156279564, + "rewards//mean": 0.77294921875, + "rewards//std": 0.028927693143486977, + "step": 3380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6762, + "grad_norm": 2.7050724029541016, + "kl": 1.5748549588024616, + "learning_rate": 2.418012691805191e-07, + "loss": 0.1575, + "num_tokens": 29219364.0, + "reward": 0.76483154296875, + "reward_std": 0.010899614542722702, + "rewards//mean": 0.76483154296875, + "rewards//std": 0.028375515714287758, + "step": 3381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6764, + "grad_norm": 1.760290503501892, + "kl": 0.8585295286029577, + "learning_rate": 2.4152957374316856e-07, + "loss": 0.0859, + "num_tokens": 29228036.0, + "reward": 0.76336669921875, + "reward_std": 0.0057825371623039246, + "rewards//mean": 0.76336669921875, + "rewards//std": 0.020265283063054085, + "step": 3382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6766, + "grad_norm": 5.1764092445373535, + "kl": 2.445761889219284, + "learning_rate": 2.412579824176748e-07, + "loss": 0.2446, + "num_tokens": 29236772.0, + "reward": 0.73138427734375, + "reward_std": 0.013500506058335304, + "rewards//mean": 0.73138427734375, + "rewards//std": 0.029378335922956467, + "step": 3383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6768, + "grad_norm": 0.9329017400741577, + "kl": 0.6388252954930067, + "learning_rate": 2.4098649531343494e-07, + "loss": 0.0639, + "num_tokens": 29245404.0, + "reward": 0.77484130859375, + "reward_std": 0.0018857249524444342, + "rewards//mean": 0.77484130859375, + "rewards//std": 0.02054496295750141, + "step": 3384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.677, + "grad_norm": 2.6711816787719727, + "kl": 0.7474976181983948, + "learning_rate": 2.407151125398036e-07, + "loss": 0.0747, + "num_tokens": 29253988.0, + "reward": 0.73211669921875, + "reward_std": 0.004077882505953312, + "rewards//mean": 0.73211669921875, + "rewards//std": 0.03512388840317726, + "step": 3385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6772, + "grad_norm": 1.6185574531555176, + "kl": 1.3612923603504896, + "learning_rate": 2.4044383420609406e-07, + "loss": 0.1361, + "num_tokens": 29262740.0, + "reward": 0.78118896484375, + "reward_std": 0.009773509576916695, + "rewards//mean": 0.78118896484375, + "rewards//std": 0.03338947147130966, + "step": 3386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6774, + "grad_norm": 9.484457969665527, + "kl": 2.4711567386984825, + "learning_rate": 2.4017266042157695e-07, + "loss": 0.2471, + "num_tokens": 29271436.0, + "reward": 0.7337646484375, + "reward_std": 0.01019862201064825, + "rewards//mean": 0.7337646484375, + "rewards//std": 0.035823144018650055, + "step": 3387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6776, + "grad_norm": 1.5974787473678589, + "kl": 1.3902613278478384, + "learning_rate": 2.3990159129548133e-07, + "loss": 0.139, + "num_tokens": 29280116.0, + "reward": 0.76324462890625, + "reward_std": 0.008438550867140293, + "rewards//mean": 0.76324462890625, + "rewards//std": 0.02748614177107811, + "step": 3388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6778, + "grad_norm": 0.9129928350448608, + "kl": 0.885471299290657, + "learning_rate": 2.396306269369935e-07, + "loss": 0.0885, + "num_tokens": 29288668.0, + "reward": 0.75567626953125, + "reward_std": 0.003517745528370142, + "rewards//mean": 0.75567626953125, + "rewards//std": 0.03150418773293495, + "step": 3389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.678, + "grad_norm": 2.9780983924865723, + "kl": 1.3422704469412565, + "learning_rate": 2.393597674552579e-07, + "loss": 0.1342, + "num_tokens": 29297244.0, + "reward": 0.73760986328125, + "reward_std": 0.0039279162883758545, + "rewards//mean": 0.73760986328125, + "rewards//std": 0.027086148038506508, + "step": 3390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6782, + "grad_norm": 0.3522569239139557, + "kl": 0.4426173698157072, + "learning_rate": 2.390890129593771e-07, + "loss": 0.0443, + "num_tokens": 29305900.0, + "reward": 0.78973388671875, + "reward_std": 0.0013152279425412416, + "rewards//mean": 0.78973388671875, + "rewards//std": 0.020978784188628197, + "step": 3391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6784, + "grad_norm": 0.4591626226902008, + "kl": 0.43246139772236347, + "learning_rate": 2.3881836355841045e-07, + "loss": 0.0432, + "num_tokens": 29314524.0, + "reward": 0.766845703125, + "reward_std": 0.0005712973070330918, + "rewards//mean": 0.766845703125, + "rewards//std": 0.02383585087954998, + "step": 3392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6786, + "grad_norm": 3.624645709991455, + "kl": 2.614571699872613, + "learning_rate": 2.3854781936137576e-07, + "loss": 0.2615, + "num_tokens": 29323148.0, + "reward": 0.79736328125, + "reward_std": 0.020335033535957336, + "rewards//mean": 0.79736328125, + "rewards//std": 0.03195888176560402, + "step": 3393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6788, + "grad_norm": 2.352867364883423, + "kl": 1.9935352392494678, + "learning_rate": 2.382773804772481e-07, + "loss": 0.1994, + "num_tokens": 29331772.0, + "reward": 0.76220703125, + "reward_std": 0.010856258682906628, + "rewards//mean": 0.76220703125, + "rewards//std": 0.029891250655055046, + "step": 3394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.679, + "grad_norm": 0.40034225583076477, + "kl": 0.45466030202805996, + "learning_rate": 2.380070470149605e-07, + "loss": 0.0455, + "num_tokens": 29340372.0, + "reward": 0.75067138671875, + "reward_std": 0.0009699611109681427, + "rewards//mean": 0.75067138671875, + "rewards//std": 0.02715758979320526, + "step": 3395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6792, + "grad_norm": 4.5532026290893555, + "kl": 2.282718777656555, + "learning_rate": 2.3773681908340282e-07, + "loss": 0.2283, + "num_tokens": 29349060.0, + "reward": 0.772705078125, + "reward_std": 0.01035287044942379, + "rewards//mean": 0.772705078125, + "rewards//std": 0.025054534897208214, + "step": 3396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6794, + "grad_norm": 3.0215342044830322, + "kl": 2.0789489187300205, + "learning_rate": 2.3746669679142312e-07, + "loss": 0.2079, + "num_tokens": 29357748.0, + "reward": 0.79876708984375, + "reward_std": 0.014160914346575737, + "rewards//mean": 0.79876708984375, + "rewards//std": 0.030465032905340195, + "step": 3397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6796, + "grad_norm": 8.799727439880371, + "kl": 2.822776285931468, + "learning_rate": 2.3719668024782647e-07, + "loss": 0.2823, + "num_tokens": 29366428.0, + "reward": 0.74224853515625, + "reward_std": 0.0160053763538599, + "rewards//mean": 0.74224853515625, + "rewards//std": 0.03987554460763931, + "step": 3398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6798, + "grad_norm": 9.087366104125977, + "kl": 3.0379379335790873, + "learning_rate": 2.369267695613758e-07, + "loss": 0.3038, + "num_tokens": 29375076.0, + "reward": 0.76739501953125, + "reward_std": 0.013445738703012466, + "rewards//mean": 0.76739501953125, + "rewards//std": 0.025647403672337532, + "step": 3399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.68, + "grad_norm": 3.181220769882202, + "kl": 1.6500026509165764, + "learning_rate": 2.3665696484079074e-07, + "loss": 0.165, + "num_tokens": 29383692.0, + "reward": 0.74432373046875, + "reward_std": 0.013830777257680893, + "rewards//mean": 0.74432373046875, + "rewards//std": 0.031962137669324875, + "step": 3400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6802, + "grad_norm": 0.7625852823257446, + "kl": 0.7513136621564627, + "learning_rate": 2.3638726619474875e-07, + "loss": 0.0751, + "num_tokens": 29392220.0, + "reward": 0.758056640625, + "reward_std": 0.0030323986429721117, + "rewards//mean": 0.758056640625, + "rewards//std": 0.02529505454003811, + "step": 3401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6804, + "grad_norm": 9.181245803833008, + "kl": 1.1685108933597803, + "learning_rate": 2.361176737318844e-07, + "loss": 0.1169, + "num_tokens": 29400804.0, + "reward": 0.740966796875, + "reward_std": 0.005903527606278658, + "rewards//mean": 0.740966796875, + "rewards//std": 0.02826172299683094, + "step": 3402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6806, + "grad_norm": 0.866474449634552, + "kl": 1.147150119766593, + "learning_rate": 2.3584818756078968e-07, + "loss": 0.1147, + "num_tokens": 29409404.0, + "reward": 0.7652587890625, + "reward_std": 0.005608399864286184, + "rewards//mean": 0.7652587890625, + "rewards//std": 0.02635340578854084, + "step": 3403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6808, + "grad_norm": 1.0039067268371582, + "kl": 1.0509447287768126, + "learning_rate": 2.355788077900132e-07, + "loss": 0.1051, + "num_tokens": 29418116.0, + "reward": 0.74334716796875, + "reward_std": 0.004647059831768274, + "rewards//mean": 0.74334716796875, + "rewards//std": 0.03402315825223923, + "step": 3404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.681, + "grad_norm": 11.03630542755127, + "kl": 1.6488002054393291, + "learning_rate": 2.353095345280614e-07, + "loss": 0.1649, + "num_tokens": 29426748.0, + "reward": 0.7178955078125, + "reward_std": 0.00679437629878521, + "rewards//mean": 0.7178955078125, + "rewards//std": 0.029728500172495842, + "step": 3405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6812, + "grad_norm": 5.669913291931152, + "kl": 2.1606679391115904, + "learning_rate": 2.350403678833976e-07, + "loss": 0.2161, + "num_tokens": 29435372.0, + "reward": 0.76025390625, + "reward_std": 0.009974752552807331, + "rewards//mean": 0.76025390625, + "rewards//std": 0.022867362946271896, + "step": 3406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6814, + "grad_norm": 10.089519500732422, + "kl": 1.6384042985737324, + "learning_rate": 2.3477130796444173e-07, + "loss": 0.1638, + "num_tokens": 29444132.0, + "reward": 0.7431640625, + "reward_std": 0.003928555175662041, + "rewards//mean": 0.7431640625, + "rewards//std": 0.027374735102057457, + "step": 3407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6816, + "grad_norm": 2.213124990463257, + "kl": 1.1423519644886255, + "learning_rate": 2.3450235487957133e-07, + "loss": 0.1142, + "num_tokens": 29452660.0, + "reward": 0.7620849609375, + "reward_std": 0.0072975982911884785, + "rewards//mean": 0.7620849609375, + "rewards//std": 0.02418731525540352, + "step": 3408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6818, + "grad_norm": 0.7783774733543396, + "kl": 0.43877203948795795, + "learning_rate": 2.3423350873712054e-07, + "loss": 0.0439, + "num_tokens": 29461300.0, + "reward": 0.76446533203125, + "reward_std": 0.0014583036536350846, + "rewards//mean": 0.76446533203125, + "rewards//std": 0.01991415023803711, + "step": 3409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.682, + "grad_norm": 5.056840896606445, + "kl": 1.614849902689457, + "learning_rate": 2.3396476964538093e-07, + "loss": 0.1615, + "num_tokens": 29469932.0, + "reward": 0.76641845703125, + "reward_std": 0.010773098096251488, + "rewards//mean": 0.76641845703125, + "rewards//std": 0.0359443724155426, + "step": 3410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6822, + "grad_norm": 4.509110450744629, + "kl": 1.682168997824192, + "learning_rate": 2.3369613771260005e-07, + "loss": 0.1682, + "num_tokens": 29478556.0, + "reward": 0.7061767578125, + "reward_std": 0.014997019432485104, + "rewards//mean": 0.7061767578125, + "rewards//std": 0.04477386921644211, + "step": 3411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6824, + "grad_norm": 2.922154188156128, + "kl": 0.9267830327153206, + "learning_rate": 2.334276130469831e-07, + "loss": 0.0927, + "num_tokens": 29487164.0, + "reward": 0.7880859375, + "reward_std": 0.010617008432745934, + "rewards//mean": 0.7880859375, + "rewards//std": 0.02340647764503956, + "step": 3412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6826, + "grad_norm": 0.9967247843742371, + "kl": 0.6221606954932213, + "learning_rate": 2.331591957566917e-07, + "loss": 0.0622, + "num_tokens": 29495764.0, + "reward": 0.77288818359375, + "reward_std": 0.002766229910776019, + "rewards//mean": 0.77288818359375, + "rewards//std": 0.024318622425198555, + "step": 3413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6828, + "grad_norm": 3.7024216651916504, + "kl": 1.1487776562571526, + "learning_rate": 2.328908859498445e-07, + "loss": 0.1149, + "num_tokens": 29504324.0, + "reward": 0.7437744140625, + "reward_std": 0.006756445858627558, + "rewards//mean": 0.7437744140625, + "rewards//std": 0.02624288760125637, + "step": 3414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.683, + "grad_norm": 2.1466031074523926, + "kl": 0.5881429314613342, + "learning_rate": 2.3262268373451637e-07, + "loss": 0.0588, + "num_tokens": 29512956.0, + "reward": 0.7904052734375, + "reward_std": 0.002071601804345846, + "rewards//mean": 0.7904052734375, + "rewards//std": 0.017444567754864693, + "step": 3415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6832, + "grad_norm": 2.218020439147949, + "kl": 1.493228729814291, + "learning_rate": 2.3235458921873923e-07, + "loss": 0.1493, + "num_tokens": 29521564.0, + "reward": 0.750732421875, + "reward_std": 0.008425773121416569, + "rewards//mean": 0.750732421875, + "rewards//std": 0.03546064719557762, + "step": 3416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6834, + "grad_norm": 2.709882974624634, + "kl": 0.9368243142962456, + "learning_rate": 2.3208660251050156e-07, + "loss": 0.0937, + "num_tokens": 29530244.0, + "reward": 0.72564697265625, + "reward_std": 0.006954542826861143, + "rewards//mean": 0.72564697265625, + "rewards//std": 0.0332968533039093, + "step": 3417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6836, + "grad_norm": 4.682353973388672, + "kl": 1.5745956003665924, + "learning_rate": 2.3181872371774853e-07, + "loss": 0.1575, + "num_tokens": 29538836.0, + "reward": 0.73187255859375, + "reward_std": 0.006184516940265894, + "rewards//mean": 0.73187255859375, + "rewards//std": 0.024472510442137718, + "step": 3418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6838, + "grad_norm": 2.588865280151367, + "kl": 2.3100268989801407, + "learning_rate": 2.3155095294838133e-07, + "loss": 0.231, + "num_tokens": 29547452.0, + "reward": 0.72186279296875, + "reward_std": 0.020480819046497345, + "rewards//mean": 0.72186279296875, + "rewards//std": 0.04030261188745499, + "step": 3419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.684, + "grad_norm": 4.484951019287109, + "kl": 2.0096003264188766, + "learning_rate": 2.3128329031025818e-07, + "loss": 0.201, + "num_tokens": 29556204.0, + "reward": 0.7879638671875, + "reward_std": 0.016406118869781494, + "rewards//mean": 0.7879638671875, + "rewards//std": 0.035238612443208694, + "step": 3420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6842, + "grad_norm": 2.283473491668701, + "kl": 1.1008089631795883, + "learning_rate": 2.310157359111938e-07, + "loss": 0.1101, + "num_tokens": 29564876.0, + "reward": 0.7523193359375, + "reward_std": 0.005407180171459913, + "rewards//mean": 0.7523193359375, + "rewards//std": 0.028812075033783913, + "step": 3421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6844, + "grad_norm": 5.860591888427734, + "kl": 2.1785377357155085, + "learning_rate": 2.3074828985895855e-07, + "loss": 0.2179, + "num_tokens": 29573652.0, + "reward": 0.77313232421875, + "reward_std": 0.010938340798020363, + "rewards//mean": 0.77313232421875, + "rewards//std": 0.04749764874577522, + "step": 3422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6846, + "grad_norm": 1.1679402589797974, + "kl": 0.7052957583218813, + "learning_rate": 2.3048095226128017e-07, + "loss": 0.0705, + "num_tokens": 29582300.0, + "reward": 0.77728271484375, + "reward_std": 0.0028641957323998213, + "rewards//mean": 0.77728271484375, + "rewards//std": 0.01806221716105938, + "step": 3423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6848, + "grad_norm": 3.9721193313598633, + "kl": 1.3363281898200512, + "learning_rate": 2.3021372322584183e-07, + "loss": 0.1336, + "num_tokens": 29590900.0, + "reward": 0.7579345703125, + "reward_std": 0.008450223132967949, + "rewards//mean": 0.7579345703125, + "rewards//std": 0.026171263307332993, + "step": 3424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.685, + "grad_norm": 3.4043588638305664, + "kl": 1.1704448498785496, + "learning_rate": 2.2994660286028345e-07, + "loss": 0.117, + "num_tokens": 29599532.0, + "reward": 0.76654052734375, + "reward_std": 0.012459220364689827, + "rewards//mean": 0.76654052734375, + "rewards//std": 0.0383475124835968, + "step": 3425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6852, + "grad_norm": 2.6654860973358154, + "kl": 1.4456407595425844, + "learning_rate": 2.2967959127220137e-07, + "loss": 0.1446, + "num_tokens": 29608244.0, + "reward": 0.73944091796875, + "reward_std": 0.007310228887945414, + "rewards//mean": 0.73944091796875, + "rewards//std": 0.024858538061380386, + "step": 3426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6854, + "grad_norm": 1.1171154975891113, + "kl": 1.0357328671962023, + "learning_rate": 2.2941268856914743e-07, + "loss": 0.1036, + "num_tokens": 29616948.0, + "reward": 0.7396240234375, + "reward_std": 0.006334719248116016, + "rewards//mean": 0.7396240234375, + "rewards//std": 0.030163230374455452, + "step": 3427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6856, + "grad_norm": 7.681259632110596, + "kl": 2.380631636828184, + "learning_rate": 2.2914589485863012e-07, + "loss": 0.2381, + "num_tokens": 29625644.0, + "reward": 0.7449951171875, + "reward_std": 0.01836434006690979, + "rewards//mean": 0.7449951171875, + "rewards//std": 0.03356294333934784, + "step": 3428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6858, + "grad_norm": 1.572418212890625, + "kl": 1.2912851199507713, + "learning_rate": 2.2887921024811402e-07, + "loss": 0.1291, + "num_tokens": 29634252.0, + "reward": 0.774658203125, + "reward_std": 0.008324958384037018, + "rewards//mean": 0.774658203125, + "rewards//std": 0.02487991936504841, + "step": 3429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.686, + "grad_norm": 3.2029104232788086, + "kl": 0.9079039469361305, + "learning_rate": 2.2861263484501974e-07, + "loss": 0.0908, + "num_tokens": 29642836.0, + "reward": 0.72137451171875, + "reward_std": 0.0046462430618703365, + "rewards//mean": 0.72137451171875, + "rewards//std": 0.033954571932554245, + "step": 3430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6862, + "grad_norm": 3.04046368598938, + "kl": 1.5037508327513933, + "learning_rate": 2.283461687567236e-07, + "loss": 0.1504, + "num_tokens": 29651476.0, + "reward": 0.733642578125, + "reward_std": 0.004632361698895693, + "rewards//mean": 0.733642578125, + "rewards//std": 0.030256683006882668, + "step": 3431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6864, + "grad_norm": 6.381411075592041, + "kl": 1.488565793260932, + "learning_rate": 2.280798120905581e-07, + "loss": 0.1489, + "num_tokens": 29660084.0, + "reward": 0.74920654296875, + "reward_std": 0.0050214966759085655, + "rewards//mean": 0.74920654296875, + "rewards//std": 0.04097606986761093, + "step": 3432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6866, + "grad_norm": 4.50383996963501, + "kl": 2.655251756310463, + "learning_rate": 2.278135649538118e-07, + "loss": 0.2655, + "num_tokens": 29668732.0, + "reward": 0.76788330078125, + "reward_std": 0.01365876104682684, + "rewards//mean": 0.76788330078125, + "rewards//std": 0.022083982825279236, + "step": 3433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6868, + "grad_norm": 2.905688524246216, + "kl": 1.2532624956220388, + "learning_rate": 2.275474274537292e-07, + "loss": 0.1253, + "num_tokens": 29677452.0, + "reward": 0.746826171875, + "reward_std": 0.007784257642924786, + "rewards//mean": 0.746826171875, + "rewards//std": 0.03215061128139496, + "step": 3434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.687, + "grad_norm": 4.906466007232666, + "kl": 1.471953809261322, + "learning_rate": 2.2728139969751003e-07, + "loss": 0.1472, + "num_tokens": 29686124.0, + "reward": 0.7803955078125, + "reward_std": 0.004127781838178635, + "rewards//mean": 0.7803955078125, + "rewards//std": 0.02497313730418682, + "step": 3435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6872, + "grad_norm": 1.8104697465896606, + "kl": 1.1774574387818575, + "learning_rate": 2.2701548179231046e-07, + "loss": 0.1177, + "num_tokens": 29694780.0, + "reward": 0.75677490234375, + "reward_std": 0.004315837286412716, + "rewards//mean": 0.75677490234375, + "rewards//std": 0.019278401508927345, + "step": 3436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6874, + "grad_norm": 3.1200742721557617, + "kl": 1.244502855464816, + "learning_rate": 2.2674967384524234e-07, + "loss": 0.1245, + "num_tokens": 29703332.0, + "reward": 0.75726318359375, + "reward_std": 0.01385971438139677, + "rewards//mean": 0.75726318359375, + "rewards//std": 0.03370263800024986, + "step": 3437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6876, + "grad_norm": 2.1987907886505127, + "kl": 1.1508431360125542, + "learning_rate": 2.2648397596337276e-07, + "loss": 0.1151, + "num_tokens": 29711948.0, + "reward": 0.75787353515625, + "reward_std": 0.008378233760595322, + "rewards//mean": 0.75787353515625, + "rewards//std": 0.02914450503885746, + "step": 3438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6878, + "grad_norm": 2.943302631378174, + "kl": 1.3111402317881584, + "learning_rate": 2.262183882537249e-07, + "loss": 0.1311, + "num_tokens": 29720668.0, + "reward": 0.75335693359375, + "reward_std": 0.01017959788441658, + "rewards//mean": 0.75335693359375, + "rewards//std": 0.031847793608903885, + "step": 3439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.688, + "grad_norm": 2.790065288543701, + "kl": 1.9738186970353127, + "learning_rate": 2.2595291082327762e-07, + "loss": 0.1974, + "num_tokens": 29729292.0, + "reward": 0.77166748046875, + "reward_std": 0.01846879906952381, + "rewards//mean": 0.77166748046875, + "rewards//std": 0.0284075066447258, + "step": 3440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6882, + "grad_norm": 1.2378202676773071, + "kl": 0.9104319382458925, + "learning_rate": 2.2568754377896515e-07, + "loss": 0.091, + "num_tokens": 29737836.0, + "reward": 0.76873779296875, + "reward_std": 0.006842778064310551, + "rewards//mean": 0.76873779296875, + "rewards//std": 0.02427189238369465, + "step": 3441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6884, + "grad_norm": 1.2944035530090332, + "kl": 0.6032543014734983, + "learning_rate": 2.2542228722767714e-07, + "loss": 0.0603, + "num_tokens": 29746516.0, + "reward": 0.7513427734375, + "reward_std": 0.004162848927080631, + "rewards//mean": 0.7513427734375, + "rewards//std": 0.02274756319820881, + "step": 3442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6886, + "grad_norm": 1.2229467630386353, + "kl": 1.1748757865279913, + "learning_rate": 2.2515714127625897e-07, + "loss": 0.1175, + "num_tokens": 29755116.0, + "reward": 0.7579345703125, + "reward_std": 0.00951804593205452, + "rewards//mean": 0.7579345703125, + "rewards//std": 0.034827250987291336, + "step": 3443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6888, + "grad_norm": 2.093184232711792, + "kl": 1.2586719617247581, + "learning_rate": 2.2489210603151144e-07, + "loss": 0.1259, + "num_tokens": 29763788.0, + "reward": 0.7451171875, + "reward_std": 0.005815399810671806, + "rewards//mean": 0.7451171875, + "rewards//std": 0.026730112731456757, + "step": 3444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.689, + "grad_norm": 5.375389099121094, + "kl": 2.543575966730714, + "learning_rate": 2.2462718160019083e-07, + "loss": 0.2544, + "num_tokens": 29772500.0, + "reward": 0.7581787109375, + "reward_std": 0.018001504242420197, + "rewards//mean": 0.7581787109375, + "rewards//std": 0.037069171667099, + "step": 3445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6892, + "grad_norm": 13.179305076599121, + "kl": 2.4546472672373056, + "learning_rate": 2.2436236808900844e-07, + "loss": 0.2455, + "num_tokens": 29781164.0, + "reward": 0.75, + "reward_std": 0.008823180571198463, + "rewards//mean": 0.75, + "rewards//std": 0.02772638387978077, + "step": 3446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6894, + "grad_norm": 1.9171929359436035, + "kl": 0.8235469274222851, + "learning_rate": 2.2409766560463118e-07, + "loss": 0.0824, + "num_tokens": 29789860.0, + "reward": 0.7313232421875, + "reward_std": 0.004143203608691692, + "rewards//mean": 0.7313232421875, + "rewards//std": 0.026725297793745995, + "step": 3447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6896, + "grad_norm": 1.5478960275650024, + "kl": 1.3127049840986729, + "learning_rate": 2.238330742536812e-07, + "loss": 0.1313, + "num_tokens": 29798612.0, + "reward": 0.7452392578125, + "reward_std": 0.006193601060658693, + "rewards//mean": 0.7452392578125, + "rewards//std": 0.01971600204706192, + "step": 3448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6898, + "grad_norm": 5.369910717010498, + "kl": 2.7993185967206955, + "learning_rate": 2.235685941427361e-07, + "loss": 0.2799, + "num_tokens": 29807228.0, + "reward": 0.73876953125, + "reward_std": 0.012811808846890926, + "rewards//mean": 0.73876953125, + "rewards//std": 0.032470136880874634, + "step": 3449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.69, + "grad_norm": 7.30379056930542, + "kl": 2.0506974514573812, + "learning_rate": 2.23304225378328e-07, + "loss": 0.2051, + "num_tokens": 29815892.0, + "reward": 0.7467041015625, + "reward_std": 0.008631259202957153, + "rewards//mean": 0.7467041015625, + "rewards//std": 0.035435665398836136, + "step": 3450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6902, + "grad_norm": 5.265349864959717, + "kl": 2.1737058740109205, + "learning_rate": 2.2303996806694486e-07, + "loss": 0.2174, + "num_tokens": 29824460.0, + "reward": 0.75830078125, + "reward_std": 0.011730147525668144, + "rewards//mean": 0.75830078125, + "rewards//std": 0.034135591238737106, + "step": 3451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6904, + "grad_norm": 7.097456932067871, + "kl": 1.820293853059411, + "learning_rate": 2.227758223150296e-07, + "loss": 0.182, + "num_tokens": 29833092.0, + "reward": 0.76959228515625, + "reward_std": 0.007021708879619837, + "rewards//mean": 0.76959228515625, + "rewards//std": 0.0243745818734169, + "step": 3452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6906, + "grad_norm": 2.8215932846069336, + "kl": 0.8802655562758446, + "learning_rate": 2.2251178822897987e-07, + "loss": 0.088, + "num_tokens": 29841684.0, + "reward": 0.76611328125, + "reward_std": 0.0054050348699092865, + "rewards//mean": 0.76611328125, + "rewards//std": 0.02663480117917061, + "step": 3453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6908, + "grad_norm": 2.0751430988311768, + "kl": 0.7627586293965578, + "learning_rate": 2.222478659151486e-07, + "loss": 0.0763, + "num_tokens": 29850356.0, + "reward": 0.74560546875, + "reward_std": 0.003952002618461847, + "rewards//mean": 0.74560546875, + "rewards//std": 0.025310609489679337, + "step": 3454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.691, + "grad_norm": 6.305367469787598, + "kl": 1.7149363122880459, + "learning_rate": 2.2198405547984371e-07, + "loss": 0.1715, + "num_tokens": 29858956.0, + "reward": 0.7662353515625, + "reward_std": 0.007152431644499302, + "rewards//mean": 0.7662353515625, + "rewards//std": 0.030877452343702316, + "step": 3455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6912, + "grad_norm": 2.8520452976226807, + "kl": 1.6952407341450453, + "learning_rate": 2.2172035702932823e-07, + "loss": 0.1695, + "num_tokens": 29867580.0, + "reward": 0.7239990234375, + "reward_std": 0.01271811779588461, + "rewards//mean": 0.7239990234375, + "rewards//std": 0.026668597012758255, + "step": 3456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6914, + "grad_norm": 4.984213829040527, + "kl": 1.3043750114738941, + "learning_rate": 2.2145677066981945e-07, + "loss": 0.1304, + "num_tokens": 29876068.0, + "reward": 0.744140625, + "reward_std": 0.007074702996760607, + "rewards//mean": 0.744140625, + "rewards//std": 0.04353712126612663, + "step": 3457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6916, + "grad_norm": 0.9789470434188843, + "kl": 0.7499772869050503, + "learning_rate": 2.2119329650749018e-07, + "loss": 0.075, + "num_tokens": 29884700.0, + "reward": 0.721923828125, + "reward_std": 0.001389256096445024, + "rewards//mean": 0.721923828125, + "rewards//std": 0.024497317150235176, + "step": 3458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6918, + "grad_norm": 4.345032691955566, + "kl": 1.048671878874302, + "learning_rate": 2.209299346484677e-07, + "loss": 0.1049, + "num_tokens": 29893364.0, + "reward": 0.77777099609375, + "reward_std": 0.006858708802610636, + "rewards//mean": 0.77777099609375, + "rewards//std": 0.028545185923576355, + "step": 3459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.692, + "grad_norm": 2.8153746128082275, + "kl": 1.120677251368761, + "learning_rate": 2.2066668519883436e-07, + "loss": 0.1121, + "num_tokens": 29902108.0, + "reward": 0.784423828125, + "reward_std": 0.006089504808187485, + "rewards//mean": 0.784423828125, + "rewards//std": 0.024218900129199028, + "step": 3460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6922, + "grad_norm": 2.2207815647125244, + "kl": 0.7675087656825781, + "learning_rate": 2.2040354826462664e-07, + "loss": 0.0768, + "num_tokens": 29910716.0, + "reward": 0.8231201171875, + "reward_std": 0.0036769742146134377, + "rewards//mean": 0.8231201171875, + "rewards//std": 0.013140609487891197, + "step": 3461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6924, + "grad_norm": 2.247210741043091, + "kl": 1.3911301456391811, + "learning_rate": 2.2014052395183623e-07, + "loss": 0.1391, + "num_tokens": 29919324.0, + "reward": 0.73529052734375, + "reward_std": 0.006952348630875349, + "rewards//mean": 0.73529052734375, + "rewards//std": 0.025370271876454353, + "step": 3462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6926, + "grad_norm": 7.882184028625488, + "kl": 1.5191735364496708, + "learning_rate": 2.1987761236640933e-07, + "loss": 0.1519, + "num_tokens": 29928028.0, + "reward": 0.75970458984375, + "reward_std": 0.008088035508990288, + "rewards//mean": 0.75970458984375, + "rewards//std": 0.031604453921318054, + "step": 3463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6928, + "grad_norm": 1.2043476104736328, + "kl": 0.6854917872697115, + "learning_rate": 2.1961481361424683e-07, + "loss": 0.0685, + "num_tokens": 29936644.0, + "reward": 0.72528076171875, + "reward_std": 0.004411098547279835, + "rewards//mean": 0.72528076171875, + "rewards//std": 0.01820330135524273, + "step": 3464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.693, + "grad_norm": 3.9071271419525146, + "kl": 1.1319369841367006, + "learning_rate": 2.1935212780120365e-07, + "loss": 0.1132, + "num_tokens": 29945292.0, + "reward": 0.76409912109375, + "reward_std": 0.01105588674545288, + "rewards//mean": 0.76409912109375, + "rewards//std": 0.026799043640494347, + "step": 3465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6932, + "grad_norm": 7.860217094421387, + "kl": 1.8732512965798378, + "learning_rate": 2.190895550330899e-07, + "loss": 0.1873, + "num_tokens": 29953932.0, + "reward": 0.72113037109375, + "reward_std": 0.006328887306153774, + "rewards//mean": 0.72113037109375, + "rewards//std": 0.033208541572093964, + "step": 3466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6934, + "grad_norm": 1.5396133661270142, + "kl": 1.2919855285435915, + "learning_rate": 2.1882709541566996e-07, + "loss": 0.1292, + "num_tokens": 29962564.0, + "reward": 0.77099609375, + "reward_std": 0.00985817238688469, + "rewards//mean": 0.77099609375, + "rewards//std": 0.029302041977643967, + "step": 3467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6936, + "grad_norm": 1.5052257776260376, + "kl": 0.843318386003375, + "learning_rate": 2.1856474905466215e-07, + "loss": 0.0843, + "num_tokens": 29971156.0, + "reward": 0.7652587890625, + "reward_std": 0.0044374060817062855, + "rewards//mean": 0.7652587890625, + "rewards//std": 0.025004638358950615, + "step": 3468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6938, + "grad_norm": 3.576735019683838, + "kl": 0.8178714849054813, + "learning_rate": 2.1830251605573978e-07, + "loss": 0.0818, + "num_tokens": 29979860.0, + "reward": 0.7789306640625, + "reward_std": 0.0022976321633905172, + "rewards//mean": 0.7789306640625, + "rewards//std": 0.019284386187791824, + "step": 3469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.694, + "grad_norm": 1.5532572269439697, + "kl": 1.7233679872006178, + "learning_rate": 2.1804039652453028e-07, + "loss": 0.1723, + "num_tokens": 29988524.0, + "reward": 0.78045654296875, + "reward_std": 0.014759990386664867, + "rewards//mean": 0.78045654296875, + "rewards//std": 0.03006640449166298, + "step": 3470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6942, + "grad_norm": 12.861505508422852, + "kl": 1.9418034851551056, + "learning_rate": 2.177783905666155e-07, + "loss": 0.1942, + "num_tokens": 29997180.0, + "reward": 0.7236328125, + "reward_std": 0.008128653280436993, + "rewards//mean": 0.7236328125, + "rewards//std": 0.023704661056399345, + "step": 3471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6944, + "grad_norm": 1.8464299440383911, + "kl": 1.5258444529026747, + "learning_rate": 2.1751649828753106e-07, + "loss": 0.1526, + "num_tokens": 30005780.0, + "reward": 0.78125, + "reward_std": 0.01334303617477417, + "rewards//mean": 0.78125, + "rewards//std": 0.028202742338180542, + "step": 3472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6946, + "grad_norm": 2.8565826416015625, + "kl": 0.9910435434430838, + "learning_rate": 2.1725471979276734e-07, + "loss": 0.0991, + "num_tokens": 30014420.0, + "reward": 0.75384521484375, + "reward_std": 0.008092589676380157, + "rewards//mean": 0.75384521484375, + "rewards//std": 0.026834040880203247, + "step": 3473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6948, + "grad_norm": 1.9449750185012817, + "kl": 1.4546810742467642, + "learning_rate": 2.1699305518776868e-07, + "loss": 0.1455, + "num_tokens": 30023100.0, + "reward": 0.74609375, + "reward_std": 0.009887555614113808, + "rewards//mean": 0.74609375, + "rewards//std": 0.02063474990427494, + "step": 3474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.695, + "grad_norm": 6.253121376037598, + "kl": 2.8578324895352125, + "learning_rate": 2.1673150457793372e-07, + "loss": 0.2858, + "num_tokens": 30031708.0, + "reward": 0.76165771484375, + "reward_std": 0.014417762868106365, + "rewards//mean": 0.76165771484375, + "rewards//std": 0.03879179805517197, + "step": 3475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6952, + "grad_norm": 4.932842254638672, + "kl": 1.1664124876260757, + "learning_rate": 2.1647006806861469e-07, + "loss": 0.1166, + "num_tokens": 30040332.0, + "reward": 0.74200439453125, + "reward_std": 0.004347790032625198, + "rewards//mean": 0.74200439453125, + "rewards//std": 0.028496889397501945, + "step": 3476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6954, + "grad_norm": 0.8569820523262024, + "kl": 0.682934133335948, + "learning_rate": 2.1620874576511827e-07, + "loss": 0.0683, + "num_tokens": 30049036.0, + "reward": 0.759765625, + "reward_std": 0.002297632396221161, + "rewards//mean": 0.759765625, + "rewards//std": 0.031149081885814667, + "step": 3477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6956, + "grad_norm": 3.7145166397094727, + "kl": 1.11059539206326, + "learning_rate": 2.1594753777270513e-07, + "loss": 0.1111, + "num_tokens": 30057628.0, + "reward": 0.7496337890625, + "reward_std": 0.004676534794270992, + "rewards//mean": 0.7496337890625, + "rewards//std": 0.029722388833761215, + "step": 3478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6958, + "grad_norm": 19.450992584228516, + "kl": 4.172765189781785, + "learning_rate": 2.1568644419659003e-07, + "loss": 0.4173, + "num_tokens": 30066428.0, + "reward": 0.7474365234375, + "reward_std": 0.01303660124540329, + "rewards//mean": 0.7474365234375, + "rewards//std": 0.03164448216557503, + "step": 3479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.696, + "grad_norm": 4.5634942054748535, + "kl": 1.7840466015040874, + "learning_rate": 2.15425465141941e-07, + "loss": 0.1784, + "num_tokens": 30075180.0, + "reward": 0.71954345703125, + "reward_std": 0.009584732353687286, + "rewards//mean": 0.71954345703125, + "rewards//std": 0.03796307370066643, + "step": 3480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6962, + "grad_norm": 4.216037273406982, + "kl": 1.5700954366475344, + "learning_rate": 2.151646007138806e-07, + "loss": 0.157, + "num_tokens": 30083788.0, + "reward": 0.76458740234375, + "reward_std": 0.009255477227270603, + "rewards//mean": 0.76458740234375, + "rewards//std": 0.036566074937582016, + "step": 3481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6964, + "grad_norm": 3.278015613555908, + "kl": 1.9326049387454987, + "learning_rate": 2.1490385101748516e-07, + "loss": 0.1933, + "num_tokens": 30092452.0, + "reward": 0.76837158203125, + "reward_std": 0.016047392040491104, + "rewards//mean": 0.76837158203125, + "rewards//std": 0.034545931965112686, + "step": 3482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6966, + "grad_norm": 8.028961181640625, + "kl": 1.6088963449001312, + "learning_rate": 2.146432161577842e-07, + "loss": 0.1609, + "num_tokens": 30101036.0, + "reward": 0.76690673828125, + "reward_std": 0.006371825933456421, + "rewards//mean": 0.76690673828125, + "rewards//std": 0.031528204679489136, + "step": 3483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6968, + "grad_norm": 10.71892261505127, + "kl": 1.6825184784829617, + "learning_rate": 2.1438269623976168e-07, + "loss": 0.1683, + "num_tokens": 30109684.0, + "reward": 0.71807861328125, + "reward_std": 0.001547522027976811, + "rewards//mean": 0.71807861328125, + "rewards//std": 0.04109632596373558, + "step": 3484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.697, + "grad_norm": 2.5602171421051025, + "kl": 0.9569313321262598, + "learning_rate": 2.1412229136835497e-07, + "loss": 0.0957, + "num_tokens": 30118380.0, + "reward": 0.73321533203125, + "reward_std": 0.00275570061057806, + "rewards//mean": 0.73321533203125, + "rewards//std": 0.03565005213022232, + "step": 3485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6972, + "grad_norm": 4.997867107391357, + "kl": 1.4349370654672384, + "learning_rate": 2.1386200164845525e-07, + "loss": 0.1435, + "num_tokens": 30127108.0, + "reward": 0.765869140625, + "reward_std": 0.00639419024810195, + "rewards//mean": 0.765869140625, + "rewards//std": 0.027960164472460747, + "step": 3486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6974, + "grad_norm": 1.2759207487106323, + "kl": 1.696100426837802, + "learning_rate": 2.1360182718490689e-07, + "loss": 0.1696, + "num_tokens": 30135708.0, + "reward": 0.76019287109375, + "reward_std": 0.009268540889024734, + "rewards//mean": 0.76019287109375, + "rewards//std": 0.026916276663541794, + "step": 3487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6976, + "grad_norm": 2.6065776348114014, + "kl": 0.9505104515701532, + "learning_rate": 2.133417680825083e-07, + "loss": 0.0951, + "num_tokens": 30144324.0, + "reward": 0.7637939453125, + "reward_std": 0.008462205529212952, + "rewards//mean": 0.7637939453125, + "rewards//std": 0.033262114971876144, + "step": 3488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6978, + "grad_norm": 1.3784343004226685, + "kl": 1.2509192377328873, + "learning_rate": 2.1308182444601126e-07, + "loss": 0.1251, + "num_tokens": 30152964.0, + "reward": 0.784912109375, + "reward_std": 0.004959351848810911, + "rewards//mean": 0.784912109375, + "rewards//std": 0.026021895930171013, + "step": 3489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.698, + "grad_norm": 1.396880865097046, + "kl": 0.6012505199760199, + "learning_rate": 2.1282199638012116e-07, + "loss": 0.0601, + "num_tokens": 30161620.0, + "reward": 0.763427734375, + "reward_std": 0.0019523652736097574, + "rewards//mean": 0.763427734375, + "rewards//std": 0.026354841887950897, + "step": 3490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6982, + "grad_norm": 1.72381591796875, + "kl": 1.598063638433814, + "learning_rate": 2.125622839894964e-07, + "loss": 0.1598, + "num_tokens": 30170156.0, + "reward": 0.76593017578125, + "reward_std": 0.0056663984432816505, + "rewards//mean": 0.76593017578125, + "rewards//std": 0.025375641882419586, + "step": 3491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6984, + "grad_norm": 5.226526260375977, + "kl": 1.5482505392283201, + "learning_rate": 2.123026873787493e-07, + "loss": 0.1548, + "num_tokens": 30178804.0, + "reward": 0.74761962890625, + "reward_std": 0.011557972058653831, + "rewards//mean": 0.74761962890625, + "rewards//std": 0.02622348442673683, + "step": 3492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6986, + "grad_norm": 10.070154190063477, + "kl": 2.8559236358851194, + "learning_rate": 2.120432066524453e-07, + "loss": 0.2856, + "num_tokens": 30187564.0, + "reward": 0.763427734375, + "reward_std": 0.013345044106245041, + "rewards//mean": 0.763427734375, + "rewards//std": 0.04666721448302269, + "step": 3493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6988, + "grad_norm": 9.271356582641602, + "kl": 1.765527592971921, + "learning_rate": 2.117838419151034e-07, + "loss": 0.1766, + "num_tokens": 30196388.0, + "reward": 0.74041748046875, + "reward_std": 0.003790999762713909, + "rewards//mean": 0.74041748046875, + "rewards//std": 0.038304463028907776, + "step": 3494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.699, + "grad_norm": 0.9614835381507874, + "kl": 0.6439705770462751, + "learning_rate": 2.1152459327119537e-07, + "loss": 0.0644, + "num_tokens": 30204956.0, + "reward": 0.76947021484375, + "reward_std": 0.002850499702617526, + "rewards//mean": 0.76947021484375, + "rewards//std": 0.0234838780015707, + "step": 3495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6992, + "grad_norm": 3.066046714782715, + "kl": 2.8912434317171574, + "learning_rate": 2.1126546082514663e-07, + "loss": 0.2891, + "num_tokens": 30213580.0, + "reward": 0.75390625, + "reward_std": 0.021674014627933502, + "rewards//mean": 0.75390625, + "rewards//std": 0.038146503269672394, + "step": 3496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6994, + "grad_norm": 6.683073043823242, + "kl": 1.911673991009593, + "learning_rate": 2.1100644468133573e-07, + "loss": 0.1912, + "num_tokens": 30222212.0, + "reward": 0.74139404296875, + "reward_std": 0.007996300235390663, + "rewards//mean": 0.74139404296875, + "rewards//std": 0.03384830057621002, + "step": 3497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6996, + "grad_norm": 4.304509162902832, + "kl": 1.937950311228633, + "learning_rate": 2.1074754494409457e-07, + "loss": 0.1938, + "num_tokens": 30230876.0, + "reward": 0.7689208984375, + "reward_std": 0.00999192614108324, + "rewards//mean": 0.7689208984375, + "rewards//std": 0.03593115508556366, + "step": 3498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.6998, + "grad_norm": 3.4273970127105713, + "kl": 0.9889365285634995, + "learning_rate": 2.104887617177075e-07, + "loss": 0.0989, + "num_tokens": 30239468.0, + "reward": 0.74334716796875, + "reward_std": 0.006039255298674107, + "rewards//mean": 0.74334716796875, + "rewards//std": 0.027758508920669556, + "step": 3499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7, + "grad_norm": 3.067169666290283, + "kl": 1.706845983862877, + "learning_rate": 2.1023009510641264e-07, + "loss": 0.1707, + "num_tokens": 30248180.0, + "reward": 0.75958251953125, + "reward_std": 0.01109134778380394, + "rewards//mean": 0.75958251953125, + "rewards//std": 0.038012485951185226, + "step": 3500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7002, + "grad_norm": 2.382382392883301, + "kl": 1.4565277434885502, + "learning_rate": 2.0997154521440097e-07, + "loss": 0.1457, + "num_tokens": 30256756.0, + "reward": 0.72833251953125, + "reward_std": 0.010684599168598652, + "rewards//mean": 0.72833251953125, + "rewards//std": 0.03215855732560158, + "step": 3501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7004, + "grad_norm": 4.419671535491943, + "kl": 1.7320016026496887, + "learning_rate": 2.0971311214581598e-07, + "loss": 0.1732, + "num_tokens": 30265428.0, + "reward": 0.76824951171875, + "reward_std": 0.009989997372031212, + "rewards//mean": 0.76824951171875, + "rewards//std": 0.033703986555337906, + "step": 3502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7006, + "grad_norm": 11.491801261901855, + "kl": 2.0549047123640776, + "learning_rate": 2.0945479600475479e-07, + "loss": 0.2055, + "num_tokens": 30274076.0, + "reward": 0.7685546875, + "reward_std": 0.0032665084581822157, + "rewards//mean": 0.7685546875, + "rewards//std": 0.03355955705046654, + "step": 3503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7008, + "grad_norm": 3.4434001445770264, + "kl": 1.3798725791275501, + "learning_rate": 2.0919659689526698e-07, + "loss": 0.138, + "num_tokens": 30282668.0, + "reward": 0.75738525390625, + "reward_std": 0.013209821656346321, + "rewards//mean": 0.75738525390625, + "rewards//std": 0.028722817078232765, + "step": 3504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.701, + "grad_norm": 3.7417056560516357, + "kl": 2.3717891965061426, + "learning_rate": 2.0893851492135532e-07, + "loss": 0.2372, + "num_tokens": 30291308.0, + "reward": 0.76080322265625, + "reward_std": 0.013125067576766014, + "rewards//mean": 0.76080322265625, + "rewards//std": 0.02864365465939045, + "step": 3505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7012, + "grad_norm": 1.928330659866333, + "kl": 0.9749567937105894, + "learning_rate": 2.086805501869749e-07, + "loss": 0.0975, + "num_tokens": 30299948.0, + "reward": 0.7264404296875, + "reward_std": 0.003130007069557905, + "rewards//mean": 0.7264404296875, + "rewards//std": 0.026729827746748924, + "step": 3506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7014, + "grad_norm": 9.627230644226074, + "kl": 2.1386900395154953, + "learning_rate": 2.08422702796034e-07, + "loss": 0.2139, + "num_tokens": 30308668.0, + "reward": 0.7315673828125, + "reward_std": 0.006532980129122734, + "rewards//mean": 0.7315673828125, + "rewards//std": 0.03392184153199196, + "step": 3507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7016, + "grad_norm": 9.382357597351074, + "kl": 1.7511019222438335, + "learning_rate": 2.081649728523937e-07, + "loss": 0.1751, + "num_tokens": 30317324.0, + "reward": 0.72222900390625, + "reward_std": 0.007671227678656578, + "rewards//mean": 0.72222900390625, + "rewards//std": 0.03401203081011772, + "step": 3508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7018, + "grad_norm": 4.734748363494873, + "kl": 1.3464430086314678, + "learning_rate": 2.0790736045986734e-07, + "loss": 0.1346, + "num_tokens": 30325940.0, + "reward": 0.773193359375, + "reward_std": 0.009204069152474403, + "rewards//mean": 0.773193359375, + "rewards//std": 0.028922459110617638, + "step": 3509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.702, + "grad_norm": 6.072880744934082, + "kl": 1.7570385318249464, + "learning_rate": 2.0764986572222137e-07, + "loss": 0.1757, + "num_tokens": 30334492.0, + "reward": 0.77093505859375, + "reward_std": 0.013170243240892887, + "rewards//mean": 0.77093505859375, + "rewards//std": 0.029259584844112396, + "step": 3510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7022, + "grad_norm": 2.7795445919036865, + "kl": 1.383143663406372, + "learning_rate": 2.0739248874317438e-07, + "loss": 0.1383, + "num_tokens": 30343244.0, + "reward": 0.74908447265625, + "reward_std": 0.009015443734824657, + "rewards//mean": 0.74908447265625, + "rewards//std": 0.034735146909952164, + "step": 3511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7024, + "grad_norm": 5.081666946411133, + "kl": 2.357322560623288, + "learning_rate": 2.071352296263979e-07, + "loss": 0.2357, + "num_tokens": 30352012.0, + "reward": 0.7293701171875, + "reward_std": 0.012278255075216293, + "rewards//mean": 0.7293701171875, + "rewards//std": 0.032245345413684845, + "step": 3512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7026, + "grad_norm": 1.4532724618911743, + "kl": 1.1720546334981918, + "learning_rate": 2.0687808847551607e-07, + "loss": 0.1172, + "num_tokens": 30360580.0, + "reward": 0.77874755859375, + "reward_std": 0.0055155279114842415, + "rewards//mean": 0.77874755859375, + "rewards//std": 0.024353455752134323, + "step": 3513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7028, + "grad_norm": 1.0417137145996094, + "kl": 0.8058001361787319, + "learning_rate": 2.06621065394105e-07, + "loss": 0.0806, + "num_tokens": 30369244.0, + "reward": 0.75421142578125, + "reward_std": 0.0036253032740205526, + "rewards//mean": 0.75421142578125, + "rewards//std": 0.02612806297838688, + "step": 3514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.703, + "grad_norm": 1.1218796968460083, + "kl": 0.624186834320426, + "learning_rate": 2.0636416048569373e-07, + "loss": 0.0624, + "num_tokens": 30377868.0, + "reward": 0.77764892578125, + "reward_std": 0.0032280958257615566, + "rewards//mean": 0.77764892578125, + "rewards//std": 0.020520634949207306, + "step": 3515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7032, + "grad_norm": 3.5602364540100098, + "kl": 1.110467653721571, + "learning_rate": 2.0610737385376348e-07, + "loss": 0.111, + "num_tokens": 30386508.0, + "reward": 0.7305908203125, + "reward_std": 0.006905339192599058, + "rewards//mean": 0.7305908203125, + "rewards//std": 0.037622056901454926, + "step": 3516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7034, + "grad_norm": 2.749885320663452, + "kl": 1.3325198628008366, + "learning_rate": 2.0585070560174806e-07, + "loss": 0.1333, + "num_tokens": 30395188.0, + "reward": 0.72589111328125, + "reward_std": 0.004099859390407801, + "rewards//mean": 0.72589111328125, + "rewards//std": 0.02596300281584263, + "step": 3517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7036, + "grad_norm": 10.855755805969238, + "kl": 2.1784789003431797, + "learning_rate": 2.0559415583303307e-07, + "loss": 0.2178, + "num_tokens": 30403756.0, + "reward": 0.77325439453125, + "reward_std": 0.009535331279039383, + "rewards//mean": 0.77325439453125, + "rewards//std": 0.03129205480217934, + "step": 3518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7038, + "grad_norm": 10.527831077575684, + "kl": 2.086433619260788, + "learning_rate": 2.0533772465095688e-07, + "loss": 0.2086, + "num_tokens": 30412428.0, + "reward": 0.73138427734375, + "reward_std": 0.01129913330078125, + "rewards//mean": 0.73138427734375, + "rewards//std": 0.042139146476984024, + "step": 3519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.704, + "grad_norm": 7.8280816078186035, + "kl": 2.23949015699327, + "learning_rate": 2.0508141215881004e-07, + "loss": 0.2239, + "num_tokens": 30420988.0, + "reward": 0.74725341796875, + "reward_std": 0.011834340170025826, + "rewards//mean": 0.74725341796875, + "rewards//std": 0.0335211418569088, + "step": 3520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7042, + "grad_norm": 2.7287213802337646, + "kl": 0.9647390451282263, + "learning_rate": 2.048252184598352e-07, + "loss": 0.0965, + "num_tokens": 30429620.0, + "reward": 0.757568359375, + "reward_std": 0.007763028610497713, + "rewards//mean": 0.757568359375, + "rewards//std": 0.02815009281039238, + "step": 3521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7044, + "grad_norm": 5.407227993011475, + "kl": 2.1152491718530655, + "learning_rate": 2.0456914365722695e-07, + "loss": 0.2115, + "num_tokens": 30438340.0, + "reward": 0.7529296875, + "reward_std": 0.009901592507958412, + "rewards//mean": 0.7529296875, + "rewards//std": 0.030805055052042007, + "step": 3522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7046, + "grad_norm": 2.825568437576294, + "kl": 1.8381923697888851, + "learning_rate": 2.0431318785413228e-07, + "loss": 0.1838, + "num_tokens": 30446988.0, + "reward": 0.76885986328125, + "reward_std": 0.01419929601252079, + "rewards//mean": 0.76885986328125, + "rewards//std": 0.03796546533703804, + "step": 3523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7048, + "grad_norm": 2.4637844562530518, + "kl": 0.7617932930588722, + "learning_rate": 2.040573511536502e-07, + "loss": 0.0762, + "num_tokens": 30455604.0, + "reward": 0.7451171875, + "reward_std": 0.008604532107710838, + "rewards//mean": 0.7451171875, + "rewards//std": 0.03493856266140938, + "step": 3524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.705, + "grad_norm": 14.037069320678711, + "kl": 3.0105540826916695, + "learning_rate": 2.0380163365883184e-07, + "loss": 0.3011, + "num_tokens": 30464348.0, + "reward": 0.78472900390625, + "reward_std": 0.01689021848142147, + "rewards//mean": 0.78472900390625, + "rewards//std": 0.04055982828140259, + "step": 3525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7052, + "grad_norm": 3.618311882019043, + "kl": 1.1247129421681166, + "learning_rate": 2.0354603547267984e-07, + "loss": 0.1125, + "num_tokens": 30472980.0, + "reward": 0.76629638671875, + "reward_std": 0.005283651873469353, + "rewards//mean": 0.76629638671875, + "rewards//std": 0.027041401714086533, + "step": 3526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7054, + "grad_norm": 1.8932256698608398, + "kl": 0.7749376855790615, + "learning_rate": 2.0329055669814933e-07, + "loss": 0.0775, + "num_tokens": 30481604.0, + "reward": 0.78717041015625, + "reward_std": 0.004963844083249569, + "rewards//mean": 0.78717041015625, + "rewards//std": 0.032726410776376724, + "step": 3527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7056, + "grad_norm": 1.7759898900985718, + "kl": 0.918415017426014, + "learning_rate": 2.0303519743814724e-07, + "loss": 0.0918, + "num_tokens": 30490148.0, + "reward": 0.721923828125, + "reward_std": 0.005008267238736153, + "rewards//mean": 0.721923828125, + "rewards//std": 0.032710738480091095, + "step": 3528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7058, + "grad_norm": 1.6960872411727905, + "kl": 1.2913768850266933, + "learning_rate": 2.027799577955319e-07, + "loss": 0.1291, + "num_tokens": 30498772.0, + "reward": 0.74090576171875, + "reward_std": 0.008712713606655598, + "rewards//mean": 0.74090576171875, + "rewards//std": 0.028314633294939995, + "step": 3529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.706, + "grad_norm": 1.233095407485962, + "kl": 1.2744024842977524, + "learning_rate": 2.0252483787311408e-07, + "loss": 0.1274, + "num_tokens": 30507340.0, + "reward": 0.7371826171875, + "reward_std": 0.006915587931871414, + "rewards//mean": 0.7371826171875, + "rewards//std": 0.035560186952352524, + "step": 3530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7062, + "grad_norm": 0.9527837634086609, + "kl": 0.7655998487025499, + "learning_rate": 2.0226983777365603e-07, + "loss": 0.0766, + "num_tokens": 30515948.0, + "reward": 0.76556396484375, + "reward_std": 0.004008385818451643, + "rewards//mean": 0.76556396484375, + "rewards//std": 0.02926062047481537, + "step": 3531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7064, + "grad_norm": 1.8402091264724731, + "kl": 1.4890003986656666, + "learning_rate": 2.020149575998718e-07, + "loss": 0.1489, + "num_tokens": 30524644.0, + "reward": 0.751220703125, + "reward_std": 0.006608171388506889, + "rewards//mean": 0.751220703125, + "rewards//std": 0.03342124819755554, + "step": 3532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7066, + "grad_norm": 3.5732803344726562, + "kl": 1.1703779511153698, + "learning_rate": 2.017601974544269e-07, + "loss": 0.117, + "num_tokens": 30533236.0, + "reward": 0.7542724609375, + "reward_std": 0.009705103933811188, + "rewards//mean": 0.7542724609375, + "rewards//std": 0.03513019159436226, + "step": 3533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7068, + "grad_norm": 2.7504029273986816, + "kl": 1.360819136723876, + "learning_rate": 2.0150555743993873e-07, + "loss": 0.1361, + "num_tokens": 30541860.0, + "reward": 0.745849609375, + "reward_std": 0.0070299264043569565, + "rewards//mean": 0.745849609375, + "rewards//std": 0.032120466232299805, + "step": 3534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.707, + "grad_norm": 4.756690979003906, + "kl": 1.7864049952477217, + "learning_rate": 2.012510376589764e-07, + "loss": 0.1786, + "num_tokens": 30550548.0, + "reward": 0.7666015625, + "reward_std": 0.0077532436698675156, + "rewards//mean": 0.7666015625, + "rewards//std": 0.026484334841370583, + "step": 3535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7072, + "grad_norm": 2.8534812927246094, + "kl": 1.3615676425397396, + "learning_rate": 2.0099663821406055e-07, + "loss": 0.1362, + "num_tokens": 30559204.0, + "reward": 0.72906494140625, + "reward_std": 0.010225416161119938, + "rewards//mean": 0.72906494140625, + "rewards//std": 0.03383532539010048, + "step": 3536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7074, + "grad_norm": 1.937111496925354, + "kl": 0.7819320261478424, + "learning_rate": 2.0074235920766285e-07, + "loss": 0.0782, + "num_tokens": 30567804.0, + "reward": 0.77020263671875, + "reward_std": 0.004196600988507271, + "rewards//mean": 0.77020263671875, + "rewards//std": 0.019961223006248474, + "step": 3537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7076, + "grad_norm": 0.7658914923667908, + "kl": 0.8175707943737507, + "learning_rate": 2.0048820074220711e-07, + "loss": 0.0818, + "num_tokens": 30576468.0, + "reward": 0.7454833984375, + "reward_std": 0.00352125964127481, + "rewards//mean": 0.7454833984375, + "rewards//std": 0.02982407435774803, + "step": 3538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7078, + "grad_norm": 1.746757984161377, + "kl": 1.6477007921785116, + "learning_rate": 2.0023416292006828e-07, + "loss": 0.1648, + "num_tokens": 30585052.0, + "reward": 0.75048828125, + "reward_std": 0.010070405900478363, + "rewards//mean": 0.75048828125, + "rewards//std": 0.031004898250102997, + "step": 3539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.708, + "grad_norm": 3.3701798915863037, + "kl": 1.1526755560189486, + "learning_rate": 1.9998024584357293e-07, + "loss": 0.1153, + "num_tokens": 30593748.0, + "reward": 0.77557373046875, + "reward_std": 0.007845836691558361, + "rewards//mean": 0.77557373046875, + "rewards//std": 0.02717876248061657, + "step": 3540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7082, + "grad_norm": 1.8436521291732788, + "kl": 1.255329955369234, + "learning_rate": 1.9972644961499853e-07, + "loss": 0.1255, + "num_tokens": 30602404.0, + "reward": 0.74761962890625, + "reward_std": 0.006460602395236492, + "rewards//mean": 0.74761962890625, + "rewards//std": 0.025549832731485367, + "step": 3541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7084, + "grad_norm": 2.6338465213775635, + "kl": 1.8792631588876247, + "learning_rate": 1.994727743365743e-07, + "loss": 0.1879, + "num_tokens": 30610988.0, + "reward": 0.76397705078125, + "reward_std": 0.01392243430018425, + "rewards//mean": 0.76397705078125, + "rewards//std": 0.033695001155138016, + "step": 3542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7086, + "grad_norm": 3.640620470046997, + "kl": 1.4376482851803303, + "learning_rate": 1.9921922011048063e-07, + "loss": 0.1438, + "num_tokens": 30619572.0, + "reward": 0.77996826171875, + "reward_std": 0.009651240892708302, + "rewards//mean": 0.77996826171875, + "rewards//std": 0.028442654758691788, + "step": 3543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7088, + "grad_norm": 3.5918664932250977, + "kl": 1.8058200161904097, + "learning_rate": 1.989657870388493e-07, + "loss": 0.1806, + "num_tokens": 30628260.0, + "reward": 0.7593994140625, + "reward_std": 0.009511411190032959, + "rewards//mean": 0.7593994140625, + "rewards//std": 0.027950145304203033, + "step": 3544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.709, + "grad_norm": 4.690072536468506, + "kl": 1.6613571681082249, + "learning_rate": 1.9871247522376277e-07, + "loss": 0.1661, + "num_tokens": 30636964.0, + "reward": 0.75762939453125, + "reward_std": 0.01626511663198471, + "rewards//mean": 0.75762939453125, + "rewards//std": 0.033287305384874344, + "step": 3545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7092, + "grad_norm": 3.2033276557922363, + "kl": 2.018960013985634, + "learning_rate": 1.9845928476725522e-07, + "loss": 0.2019, + "num_tokens": 30645660.0, + "reward": 0.77655029296875, + "reward_std": 0.00996097456663847, + "rewards//mean": 0.77655029296875, + "rewards//std": 0.031181564554572105, + "step": 3546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7094, + "grad_norm": 1.5670267343521118, + "kl": 1.7156179938465357, + "learning_rate": 1.9820621577131186e-07, + "loss": 0.1716, + "num_tokens": 30654284.0, + "reward": 0.7802734375, + "reward_std": 0.00882802251726389, + "rewards//mean": 0.7802734375, + "rewards//std": 0.027761302888393402, + "step": 3547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7096, + "grad_norm": 1.5676097869873047, + "kl": 1.3604740127921104, + "learning_rate": 1.9795326833786852e-07, + "loss": 0.136, + "num_tokens": 30662900.0, + "reward": 0.76611328125, + "reward_std": 0.009017514996230602, + "rewards//mean": 0.76611328125, + "rewards//std": 0.025157036259770393, + "step": 3548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7098, + "grad_norm": 1.6342328786849976, + "kl": 1.6767144091427326, + "learning_rate": 1.9770044256881258e-07, + "loss": 0.1677, + "num_tokens": 30671548.0, + "reward": 0.78515625, + "reward_std": 0.01596813276410103, + "rewards//mean": 0.78515625, + "rewards//std": 0.02975725568830967, + "step": 3549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.71, + "grad_norm": 2.5721688270568848, + "kl": 1.6526990626007318, + "learning_rate": 1.9744773856598224e-07, + "loss": 0.1653, + "num_tokens": 30680140.0, + "reward": 0.74542236328125, + "reward_std": 0.01387165579944849, + "rewards//mean": 0.74542236328125, + "rewards//std": 0.03564240783452988, + "step": 3550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7102, + "grad_norm": 2.8021602630615234, + "kl": 1.493527203798294, + "learning_rate": 1.9719515643116674e-07, + "loss": 0.1494, + "num_tokens": 30688772.0, + "reward": 0.7255859375, + "reward_std": 0.00987439975142479, + "rewards//mean": 0.7255859375, + "rewards//std": 0.03113352507352829, + "step": 3551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7104, + "grad_norm": 2.1547632217407227, + "kl": 0.7602411434054375, + "learning_rate": 1.9694269626610588e-07, + "loss": 0.076, + "num_tokens": 30697500.0, + "reward": 0.71734619140625, + "reward_std": 0.003970570396631956, + "rewards//mean": 0.71734619140625, + "rewards//std": 0.03283262625336647, + "step": 3552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7106, + "grad_norm": 3.6139791011810303, + "kl": 1.7620182689279318, + "learning_rate": 1.9669035817249074e-07, + "loss": 0.1762, + "num_tokens": 30706204.0, + "reward": 0.740478515625, + "reward_std": 0.013333121314644814, + "rewards//mean": 0.740478515625, + "rewards//std": 0.0297480970621109, + "step": 3553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7108, + "grad_norm": 1.7233213186264038, + "kl": 0.8104466572403908, + "learning_rate": 1.9643814225196304e-07, + "loss": 0.081, + "num_tokens": 30714844.0, + "reward": 0.7425537109375, + "reward_std": 0.0031074027065187693, + "rewards//mean": 0.7425537109375, + "rewards//std": 0.027354544028639793, + "step": 3554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.711, + "grad_norm": 8.618120193481445, + "kl": 1.2111565601080656, + "learning_rate": 1.9618604860611554e-07, + "loss": 0.1211, + "num_tokens": 30723484.0, + "reward": 0.76593017578125, + "reward_std": 0.006958568934351206, + "rewards//mean": 0.76593017578125, + "rewards//std": 0.02895953133702278, + "step": 3555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7112, + "grad_norm": 4.357668399810791, + "kl": 1.2392793390899897, + "learning_rate": 1.959340773364911e-07, + "loss": 0.1239, + "num_tokens": 30732148.0, + "reward": 0.77386474609375, + "reward_std": 0.005772817879915237, + "rewards//mean": 0.77386474609375, + "rewards//std": 0.024020498618483543, + "step": 3556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7114, + "grad_norm": 5.551623344421387, + "kl": 1.7733281943947077, + "learning_rate": 1.95682228544584e-07, + "loss": 0.1773, + "num_tokens": 30740788.0, + "reward": 0.76263427734375, + "reward_std": 0.00975467637181282, + "rewards//mean": 0.76263427734375, + "rewards//std": 0.025569967925548553, + "step": 3557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7116, + "grad_norm": 2.8201558589935303, + "kl": 1.0933802891522646, + "learning_rate": 1.9543050233183878e-07, + "loss": 0.1093, + "num_tokens": 30749428.0, + "reward": 0.7666015625, + "reward_std": 0.00929994136095047, + "rewards//mean": 0.7666015625, + "rewards//std": 0.030281687155365944, + "step": 3558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7118, + "grad_norm": 7.5262651443481445, + "kl": 1.7966040633618832, + "learning_rate": 1.9517889879965104e-07, + "loss": 0.1797, + "num_tokens": 30758084.0, + "reward": 0.75469970703125, + "reward_std": 0.007460195571184158, + "rewards//mean": 0.75469970703125, + "rewards//std": 0.03001803159713745, + "step": 3559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.712, + "grad_norm": 2.6144073009490967, + "kl": 1.3635978270322084, + "learning_rate": 1.9492741804936618e-07, + "loss": 0.1364, + "num_tokens": 30766748.0, + "reward": 0.761474609375, + "reward_std": 0.01152757741510868, + "rewards//mean": 0.761474609375, + "rewards//std": 0.02437838539481163, + "step": 3560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7122, + "grad_norm": 3.463202953338623, + "kl": 1.6324880551546812, + "learning_rate": 1.9467606018228088e-07, + "loss": 0.1632, + "num_tokens": 30775412.0, + "reward": 0.77410888671875, + "reward_std": 0.012970506213605404, + "rewards//mean": 0.77410888671875, + "rewards//std": 0.03890440985560417, + "step": 3561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7124, + "grad_norm": 4.92953634262085, + "kl": 0.7262371871620417, + "learning_rate": 1.9442482529964222e-07, + "loss": 0.0726, + "num_tokens": 30784044.0, + "reward": 0.7679443359375, + "reward_std": 0.0049894810654222965, + "rewards//mean": 0.7679443359375, + "rewards//std": 0.022017134353518486, + "step": 3562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7126, + "grad_norm": 6.6041669845581055, + "kl": 1.9079391658306122, + "learning_rate": 1.9417371350264716e-07, + "loss": 0.1908, + "num_tokens": 30792812.0, + "reward": 0.77484130859375, + "reward_std": 0.011755731888115406, + "rewards//mean": 0.77484130859375, + "rewards//std": 0.030332572758197784, + "step": 3563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7128, + "grad_norm": 5.212818622589111, + "kl": 1.3849430661648512, + "learning_rate": 1.9392272489244377e-07, + "loss": 0.1385, + "num_tokens": 30801572.0, + "reward": 0.76416015625, + "reward_std": 0.007776933256536722, + "rewards//mean": 0.76416015625, + "rewards//std": 0.031692519783973694, + "step": 3564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.713, + "grad_norm": 6.331071853637695, + "kl": 1.420283379033208, + "learning_rate": 1.936718595701302e-07, + "loss": 0.142, + "num_tokens": 30810276.0, + "reward": 0.757080078125, + "reward_std": 0.007385469973087311, + "rewards//mean": 0.757080078125, + "rewards//std": 0.03816475346684456, + "step": 3565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7132, + "grad_norm": 8.7510986328125, + "kl": 1.719766629859805, + "learning_rate": 1.934211176367551e-07, + "loss": 0.172, + "num_tokens": 30818908.0, + "reward": 0.719970703125, + "reward_std": 0.01402165088802576, + "rewards//mean": 0.719970703125, + "rewards//std": 0.036570265889167786, + "step": 3566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7134, + "grad_norm": 3.2203545570373535, + "kl": 1.6549810208380222, + "learning_rate": 1.9317049919331702e-07, + "loss": 0.1655, + "num_tokens": 30827588.0, + "reward": 0.79046630859375, + "reward_std": 0.014554335735738277, + "rewards//mean": 0.79046630859375, + "rewards//std": 0.022621022537350655, + "step": 3567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7136, + "grad_norm": 4.697467803955078, + "kl": 1.4587569441646338, + "learning_rate": 1.929200043407651e-07, + "loss": 0.1459, + "num_tokens": 30836276.0, + "reward": 0.75433349609375, + "reward_std": 0.005141293630003929, + "rewards//mean": 0.75433349609375, + "rewards//std": 0.013782014138996601, + "step": 3568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7138, + "grad_norm": 0.25584834814071655, + "kl": 0.4285982269793749, + "learning_rate": 1.926696331799988e-07, + "loss": 0.0429, + "num_tokens": 30844924.0, + "reward": 0.75970458984375, + "reward_std": 0.0005179004510864615, + "rewards//mean": 0.75970458984375, + "rewards//std": 0.02958577126264572, + "step": 3569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.714, + "grad_norm": 2.684601306915283, + "kl": 1.2960066515952349, + "learning_rate": 1.9241938581186762e-07, + "loss": 0.1296, + "num_tokens": 30853660.0, + "reward": 0.76220703125, + "reward_std": 0.007334734778851271, + "rewards//mean": 0.76220703125, + "rewards//std": 0.03466365858912468, + "step": 3570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7142, + "grad_norm": 2.9744272232055664, + "kl": 0.7932864539325237, + "learning_rate": 1.9216926233717084e-07, + "loss": 0.0793, + "num_tokens": 30862332.0, + "reward": 0.76824951171875, + "reward_std": 0.0030366291757673025, + "rewards//mean": 0.76824951171875, + "rewards//std": 0.019162626937031746, + "step": 3571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7144, + "grad_norm": 2.1167335510253906, + "kl": 1.47066586650908, + "learning_rate": 1.9191926285665843e-07, + "loss": 0.1471, + "num_tokens": 30871108.0, + "reward": 0.76263427734375, + "reward_std": 0.007572601083666086, + "rewards//mean": 0.76263427734375, + "rewards//std": 0.03128673508763313, + "step": 3572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7146, + "grad_norm": 2.0134670734405518, + "kl": 1.6664400901645422, + "learning_rate": 1.9166938747103012e-07, + "loss": 0.1666, + "num_tokens": 30879828.0, + "reward": 0.76971435546875, + "reward_std": 0.010514755733311176, + "rewards//mean": 0.76971435546875, + "rewards//std": 0.03610069677233696, + "step": 3573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7148, + "grad_norm": 2.6136438846588135, + "kl": 0.7064992226660252, + "learning_rate": 1.9141963628093582e-07, + "loss": 0.0706, + "num_tokens": 30888476.0, + "reward": 0.79437255859375, + "reward_std": 0.003807157278060913, + "rewards//mean": 0.79437255859375, + "rewards//std": 0.02163594402372837, + "step": 3574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.715, + "grad_norm": 3.235882043838501, + "kl": 0.6693189088255167, + "learning_rate": 1.911700093869749e-07, + "loss": 0.0669, + "num_tokens": 30896988.0, + "reward": 0.7322998046875, + "reward_std": 0.004552899859845638, + "rewards//mean": 0.7322998046875, + "rewards//std": 0.03268180042505264, + "step": 3575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7152, + "grad_norm": 2.545570135116577, + "kl": 0.8713650051504374, + "learning_rate": 1.9092050688969736e-07, + "loss": 0.0871, + "num_tokens": 30905572.0, + "reward": 0.75054931640625, + "reward_std": 0.003458252642303705, + "rewards//mean": 0.75054931640625, + "rewards//std": 0.023896660655736923, + "step": 3576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7154, + "grad_norm": 1.2350611686706543, + "kl": 0.6518172193318605, + "learning_rate": 1.906711288896028e-07, + "loss": 0.0652, + "num_tokens": 30914188.0, + "reward": 0.7630615234375, + "reward_std": 0.0037447321228682995, + "rewards//mean": 0.7630615234375, + "rewards//std": 0.02756183221936226, + "step": 3577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7156, + "grad_norm": 2.469587802886963, + "kl": 2.0603863578289747, + "learning_rate": 1.9042187548714033e-07, + "loss": 0.206, + "num_tokens": 30922844.0, + "reward": 0.77349853515625, + "reward_std": 0.013504520989954472, + "rewards//mean": 0.77349853515625, + "rewards//std": 0.033979080617427826, + "step": 3578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7158, + "grad_norm": 2.3967666625976562, + "kl": 1.5268173851072788, + "learning_rate": 1.9017274678270945e-07, + "loss": 0.1527, + "num_tokens": 30931484.0, + "reward": 0.75531005859375, + "reward_std": 0.011130311526358128, + "rewards//mean": 0.75531005859375, + "rewards//std": 0.037887636572122574, + "step": 3579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.716, + "grad_norm": 3.234113931655884, + "kl": 1.5628940593451262, + "learning_rate": 1.8992374287665908e-07, + "loss": 0.1563, + "num_tokens": 30940148.0, + "reward": 0.73828125, + "reward_std": 0.010607191361486912, + "rewards//mean": 0.73828125, + "rewards//std": 0.043011005967855453, + "step": 3580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7162, + "grad_norm": 2.079681158065796, + "kl": 1.9411836136132479, + "learning_rate": 1.8967486386928817e-07, + "loss": 0.1941, + "num_tokens": 30948756.0, + "reward": 0.77056884765625, + "reward_std": 0.014757277444005013, + "rewards//mean": 0.77056884765625, + "rewards//std": 0.026723245158791542, + "step": 3581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7164, + "grad_norm": 2.9132089614868164, + "kl": 2.1134427580982447, + "learning_rate": 1.8942610986084484e-07, + "loss": 0.2113, + "num_tokens": 30957372.0, + "reward": 0.76605224609375, + "reward_std": 0.01430192869156599, + "rewards//mean": 0.76605224609375, + "rewards//std": 0.030505748465657234, + "step": 3582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7166, + "grad_norm": 6.0774827003479, + "kl": 1.6655714623630047, + "learning_rate": 1.891774809515273e-07, + "loss": 0.1666, + "num_tokens": 30966140.0, + "reward": 0.72808837890625, + "reward_std": 0.013912977650761604, + "rewards//mean": 0.72808837890625, + "rewards//std": 0.03684568777680397, + "step": 3583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7168, + "grad_norm": 6.967377662658691, + "kl": 1.2876193821430206, + "learning_rate": 1.8892897724148322e-07, + "loss": 0.1288, + "num_tokens": 30974716.0, + "reward": 0.7569580078125, + "reward_std": 0.005702922120690346, + "rewards//mean": 0.7569580078125, + "rewards//std": 0.030285436660051346, + "step": 3584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.717, + "grad_norm": 3.150099039077759, + "kl": 1.205721653997898, + "learning_rate": 1.8868059883081011e-07, + "loss": 0.1206, + "num_tokens": 30983468.0, + "reward": 0.73773193359375, + "reward_std": 0.013264201581478119, + "rewards//mean": 0.73773193359375, + "rewards//std": 0.03595574200153351, + "step": 3585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7172, + "grad_norm": 2.8095881938934326, + "kl": 0.9386508874595165, + "learning_rate": 1.8843234581955441e-07, + "loss": 0.0939, + "num_tokens": 30992092.0, + "reward": 0.75457763671875, + "reward_std": 0.008838159963488579, + "rewards//mean": 0.75457763671875, + "rewards//std": 0.023036781698465347, + "step": 3586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7174, + "grad_norm": 3.888221025466919, + "kl": 1.5757380612194538, + "learning_rate": 1.8818421830771252e-07, + "loss": 0.1576, + "num_tokens": 31000956.0, + "reward": 0.753173828125, + "reward_std": 0.012731881812214851, + "rewards//mean": 0.753173828125, + "rewards//std": 0.03199959173798561, + "step": 3587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7176, + "grad_norm": 4.2185211181640625, + "kl": 1.0350936073809862, + "learning_rate": 1.8793621639523027e-07, + "loss": 0.1035, + "num_tokens": 31009604.0, + "reward": 0.78009033203125, + "reward_std": 0.007864845916628838, + "rewards//mean": 0.78009033203125, + "rewards//std": 0.0253696758300066, + "step": 3588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7178, + "grad_norm": 6.849427700042725, + "kl": 1.9080579746514559, + "learning_rate": 1.8768834018200286e-07, + "loss": 0.1908, + "num_tokens": 31018228.0, + "reward": 0.7540283203125, + "reward_std": 0.009546862915158272, + "rewards//mean": 0.7540283203125, + "rewards//std": 0.02725030854344368, + "step": 3589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.718, + "grad_norm": 3.007052183151245, + "kl": 1.2691441141068935, + "learning_rate": 1.8744058976787452e-07, + "loss": 0.1269, + "num_tokens": 31026796.0, + "reward": 0.7584228515625, + "reward_std": 0.005669381935149431, + "rewards//mean": 0.7584228515625, + "rewards//std": 0.027605734765529633, + "step": 3590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7182, + "grad_norm": 1.762382984161377, + "kl": 0.6956273671239614, + "learning_rate": 1.8719296525263923e-07, + "loss": 0.0696, + "num_tokens": 31035316.0, + "reward": 0.76025390625, + "reward_std": 0.0010158405639231205, + "rewards//mean": 0.76025390625, + "rewards//std": 0.026213163509964943, + "step": 3591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7184, + "grad_norm": 2.150631904602051, + "kl": 0.7484397012740374, + "learning_rate": 1.869454667360401e-07, + "loss": 0.0748, + "num_tokens": 31043956.0, + "reward": 0.79132080078125, + "reward_std": 0.003866758430376649, + "rewards//mean": 0.79132080078125, + "rewards//std": 0.024367747828364372, + "step": 3592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7186, + "grad_norm": 1.969803810119629, + "kl": 1.5698446221649647, + "learning_rate": 1.8669809431776988e-07, + "loss": 0.157, + "num_tokens": 31052556.0, + "reward": 0.72906494140625, + "reward_std": 0.011136680841445923, + "rewards//mean": 0.72906494140625, + "rewards//std": 0.03492724150419235, + "step": 3593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7188, + "grad_norm": 5.577622413635254, + "kl": 1.617896307259798, + "learning_rate": 1.8645084809746952e-07, + "loss": 0.1618, + "num_tokens": 31061244.0, + "reward": 0.77459716796875, + "reward_std": 0.012375066056847572, + "rewards//mean": 0.77459716796875, + "rewards//std": 0.03988882899284363, + "step": 3594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.719, + "grad_norm": 1.9326990842819214, + "kl": 0.7960346397012472, + "learning_rate": 1.8620372817473002e-07, + "loss": 0.0796, + "num_tokens": 31069820.0, + "reward": 0.7052001953125, + "reward_std": 0.0022622612304985523, + "rewards//mean": 0.7052001953125, + "rewards//std": 0.029846400022506714, + "step": 3595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7192, + "grad_norm": 2.846069097518921, + "kl": 0.9224927444010973, + "learning_rate": 1.859567346490913e-07, + "loss": 0.0922, + "num_tokens": 31078396.0, + "reward": 0.7637939453125, + "reward_std": 0.007131369784474373, + "rewards//mean": 0.7637939453125, + "rewards//std": 0.02403916046023369, + "step": 3596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7194, + "grad_norm": 2.773770332336426, + "kl": 1.9939765352755785, + "learning_rate": 1.8570986762004242e-07, + "loss": 0.1994, + "num_tokens": 31087004.0, + "reward": 0.7359619140625, + "reward_std": 0.01489239651709795, + "rewards//mean": 0.7359619140625, + "rewards//std": 0.03768477588891983, + "step": 3597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7196, + "grad_norm": 1.0699416399002075, + "kl": 0.4603011291474104, + "learning_rate": 1.8546312718702118e-07, + "loss": 0.046, + "num_tokens": 31095652.0, + "reward": 0.720458984375, + "reward_std": 0.0011549049522727728, + "rewards//mean": 0.720458984375, + "rewards//std": 0.031488388776779175, + "step": 3598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7198, + "grad_norm": 3.7458226680755615, + "kl": 1.03693563118577, + "learning_rate": 1.8521651344941463e-07, + "loss": 0.1037, + "num_tokens": 31104340.0, + "reward": 0.75714111328125, + "reward_std": 0.010511523112654686, + "rewards//mean": 0.75714111328125, + "rewards//std": 0.032841384410858154, + "step": 3599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.72, + "grad_norm": 3.8834872245788574, + "kl": 1.1918646320700645, + "learning_rate": 1.8497002650655885e-07, + "loss": 0.1192, + "num_tokens": 31112924.0, + "reward": 0.757568359375, + "reward_std": 0.009045414626598358, + "rewards//mean": 0.757568359375, + "rewards//std": 0.033939018845558167, + "step": 3600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7202, + "grad_norm": 4.841434478759766, + "kl": 1.2399980742484331, + "learning_rate": 1.847236664577389e-07, + "loss": 0.124, + "num_tokens": 31121556.0, + "reward": 0.7427978515625, + "reward_std": 0.00931574311107397, + "rewards//mean": 0.7427978515625, + "rewards//std": 0.0204012431204319, + "step": 3601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7204, + "grad_norm": 7.134099960327148, + "kl": 2.078274179250002, + "learning_rate": 1.8447743340218818e-07, + "loss": 0.2078, + "num_tokens": 31130572.0, + "reward": 0.79095458984375, + "reward_std": 0.00828572642058134, + "rewards//mean": 0.79095458984375, + "rewards//std": 0.033925577998161316, + "step": 3602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7206, + "grad_norm": 3.0384371280670166, + "kl": 0.7858038395643234, + "learning_rate": 1.842313274390896e-07, + "loss": 0.0786, + "num_tokens": 31139204.0, + "reward": 0.75872802734375, + "reward_std": 0.002424883656203747, + "rewards//mean": 0.75872802734375, + "rewards//std": 0.021829580888152122, + "step": 3603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7208, + "grad_norm": 4.450641632080078, + "kl": 1.241953071206808, + "learning_rate": 1.8398534866757455e-07, + "loss": 0.1242, + "num_tokens": 31147876.0, + "reward": 0.7838134765625, + "reward_std": 0.010355833917856216, + "rewards//mean": 0.7838134765625, + "rewards//std": 0.034239865839481354, + "step": 3604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.721, + "grad_norm": 3.986569881439209, + "kl": 1.4360428992658854, + "learning_rate": 1.8373949718672344e-07, + "loss": 0.1436, + "num_tokens": 31156612.0, + "reward": 0.7557373046875, + "reward_std": 0.005488820839673281, + "rewards//mean": 0.7557373046875, + "rewards//std": 0.03421686962246895, + "step": 3605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7212, + "grad_norm": 6.947171211242676, + "kl": 2.427562654018402, + "learning_rate": 1.8349377309556486e-07, + "loss": 0.2428, + "num_tokens": 31165196.0, + "reward": 0.77081298828125, + "reward_std": 0.014342323876917362, + "rewards//mean": 0.77081298828125, + "rewards//std": 0.03191758692264557, + "step": 3606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7214, + "grad_norm": 3.037904977798462, + "kl": 1.481584157794714, + "learning_rate": 1.8324817649307668e-07, + "loss": 0.1482, + "num_tokens": 31173908.0, + "reward": 0.76544189453125, + "reward_std": 0.009519679471850395, + "rewards//mean": 0.76544189453125, + "rewards//std": 0.030721833929419518, + "step": 3607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7216, + "grad_norm": 2.3632469177246094, + "kl": 1.2183293346315622, + "learning_rate": 1.8300270747818526e-07, + "loss": 0.1218, + "num_tokens": 31182628.0, + "reward": 0.76763916015625, + "reward_std": 0.011327773332595825, + "rewards//mean": 0.76763916015625, + "rewards//std": 0.03533143922686577, + "step": 3608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7218, + "grad_norm": 1.8168559074401855, + "kl": 1.2174032609909773, + "learning_rate": 1.8275736614976517e-07, + "loss": 0.1217, + "num_tokens": 31191252.0, + "reward": 0.778564453125, + "reward_std": 0.011249836534261703, + "rewards//mean": 0.778564453125, + "rewards//std": 0.033355962485075, + "step": 3609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.722, + "grad_norm": 4.882945537567139, + "kl": 1.6730669103562832, + "learning_rate": 1.8251215260664006e-07, + "loss": 0.1673, + "num_tokens": 31199860.0, + "reward": 0.7564697265625, + "reward_std": 0.006939234212040901, + "rewards//mean": 0.7564697265625, + "rewards//std": 0.0221979022026062, + "step": 3610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7222, + "grad_norm": 4.814847469329834, + "kl": 0.9543901216238737, + "learning_rate": 1.8226706694758193e-07, + "loss": 0.0954, + "num_tokens": 31208532.0, + "reward": 0.7750244140625, + "reward_std": 0.006758289877325296, + "rewards//mean": 0.7750244140625, + "rewards//std": 0.03329668566584587, + "step": 3611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7224, + "grad_norm": 5.459836959838867, + "kl": 1.1569914650171995, + "learning_rate": 1.820221092713114e-07, + "loss": 0.1157, + "num_tokens": 31217156.0, + "reward": 0.7657470703125, + "reward_std": 0.00960256066173315, + "rewards//mean": 0.7657470703125, + "rewards//std": 0.027638617902994156, + "step": 3612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7226, + "grad_norm": 2.6592509746551514, + "kl": 1.1117970738559961, + "learning_rate": 1.8177727967649703e-07, + "loss": 0.1112, + "num_tokens": 31225852.0, + "reward": 0.76519775390625, + "reward_std": 0.006506867706775665, + "rewards//mean": 0.76519775390625, + "rewards//std": 0.021610742434859276, + "step": 3613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7228, + "grad_norm": 2.903900623321533, + "kl": 0.7259663175791502, + "learning_rate": 1.815325782617564e-07, + "loss": 0.0726, + "num_tokens": 31234452.0, + "reward": 0.7261962890625, + "reward_std": 0.004932370036840439, + "rewards//mean": 0.7261962890625, + "rewards//std": 0.02939462661743164, + "step": 3614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.723, + "grad_norm": 4.081010341644287, + "kl": 1.1607152055948973, + "learning_rate": 1.812880051256551e-07, + "loss": 0.1161, + "num_tokens": 31243124.0, + "reward": 0.76739501953125, + "reward_std": 0.007521071471273899, + "rewards//mean": 0.76739501953125, + "rewards//std": 0.02841603010892868, + "step": 3615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7232, + "grad_norm": 2.6553351879119873, + "kl": 0.7097238004207611, + "learning_rate": 1.810435603667075e-07, + "loss": 0.071, + "num_tokens": 31251788.0, + "reward": 0.73797607421875, + "reward_std": 0.006131630856543779, + "rewards//mean": 0.73797607421875, + "rewards//std": 0.04202727600932121, + "step": 3616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7234, + "grad_norm": 2.828874111175537, + "kl": 1.0383149795234203, + "learning_rate": 1.8079924408337537e-07, + "loss": 0.1038, + "num_tokens": 31260420.0, + "reward": 0.7276611328125, + "reward_std": 0.005276789888739586, + "rewards//mean": 0.7276611328125, + "rewards//std": 0.02270760014653206, + "step": 3617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7236, + "grad_norm": 2.3275647163391113, + "kl": 1.4320752043277025, + "learning_rate": 1.8055505637406958e-07, + "loss": 0.1432, + "num_tokens": 31269084.0, + "reward": 0.7178955078125, + "reward_std": 0.006786086596548557, + "rewards//mean": 0.7178955078125, + "rewards//std": 0.03217014670372009, + "step": 3618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7238, + "grad_norm": 3.2812867164611816, + "kl": 0.9148726854473352, + "learning_rate": 1.8031099733714889e-07, + "loss": 0.0915, + "num_tokens": 31277692.0, + "reward": 0.74798583984375, + "reward_std": 0.006311007309705019, + "rewards//mean": 0.74798583984375, + "rewards//std": 0.02657555602490902, + "step": 3619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.724, + "grad_norm": 3.608558177947998, + "kl": 1.4747534710913897, + "learning_rate": 1.800670670709204e-07, + "loss": 0.1475, + "num_tokens": 31286340.0, + "reward": 0.7711181640625, + "reward_std": 0.014344016090035439, + "rewards//mean": 0.7711181640625, + "rewards//std": 0.026763787493109703, + "step": 3620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7242, + "grad_norm": 0.9373873472213745, + "kl": 0.8879157714545727, + "learning_rate": 1.7982326567363886e-07, + "loss": 0.0888, + "num_tokens": 31294972.0, + "reward": 0.75830078125, + "reward_std": 0.001057920977473259, + "rewards//mean": 0.75830078125, + "rewards//std": 0.028860632330179214, + "step": 3621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7244, + "grad_norm": 6.161097049713135, + "kl": 1.4532230403274298, + "learning_rate": 1.7957959324350763e-07, + "loss": 0.1453, + "num_tokens": 31303580.0, + "reward": 0.75067138671875, + "reward_std": 0.010281732305884361, + "rewards//mean": 0.75067138671875, + "rewards//std": 0.027512013912200928, + "step": 3622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7246, + "grad_norm": 2.6001176834106445, + "kl": 2.217102773487568, + "learning_rate": 1.7933604987867813e-07, + "loss": 0.2217, + "num_tokens": 31312196.0, + "reward": 0.76751708984375, + "reward_std": 0.015863671898841858, + "rewards//mean": 0.76751708984375, + "rewards//std": 0.027830945327878, + "step": 3623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7248, + "grad_norm": 5.593306541442871, + "kl": 1.1891767289489508, + "learning_rate": 1.7909263567724914e-07, + "loss": 0.1189, + "num_tokens": 31320812.0, + "reward": 0.7554931640625, + "reward_std": 0.009153638035058975, + "rewards//mean": 0.7554931640625, + "rewards//std": 0.025312703102827072, + "step": 3624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.725, + "grad_norm": 2.9393351078033447, + "kl": 1.285300137475133, + "learning_rate": 1.788493507372682e-07, + "loss": 0.1285, + "num_tokens": 31329484.0, + "reward": 0.73040771484375, + "reward_std": 0.008501885458827019, + "rewards//mean": 0.73040771484375, + "rewards//std": 0.030469007790088654, + "step": 3625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7252, + "grad_norm": 2.2730495929718018, + "kl": 1.3680921904742718, + "learning_rate": 1.7860619515673032e-07, + "loss": 0.1368, + "num_tokens": 31338092.0, + "reward": 0.7666015625, + "reward_std": 0.009912711568176746, + "rewards//mean": 0.7666015625, + "rewards//std": 0.02257421426475048, + "step": 3626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7254, + "grad_norm": 8.12806510925293, + "kl": 2.511561468243599, + "learning_rate": 1.783631690335788e-07, + "loss": 0.2512, + "num_tokens": 31346828.0, + "reward": 0.7222900390625, + "reward_std": 0.0107989851385355, + "rewards//mean": 0.7222900390625, + "rewards//std": 0.038699448108673096, + "step": 3627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7256, + "grad_norm": 1.7022359371185303, + "kl": 1.4891007281839848, + "learning_rate": 1.7812027246570416e-07, + "loss": 0.1489, + "num_tokens": 31355364.0, + "reward": 0.78680419921875, + "reward_std": 0.011721480637788773, + "rewards//mean": 0.78680419921875, + "rewards//std": 0.027079440653324127, + "step": 3628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7258, + "grad_norm": 9.347336769104004, + "kl": 1.972281901165843, + "learning_rate": 1.7787750555094528e-07, + "loss": 0.1972, + "num_tokens": 31364044.0, + "reward": 0.76019287109375, + "reward_std": 0.00617905892431736, + "rewards//mean": 0.76019287109375, + "rewards//std": 0.03545205667614937, + "step": 3629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.726, + "grad_norm": 2.5284423828125, + "kl": 1.0939804501831532, + "learning_rate": 1.7763486838708856e-07, + "loss": 0.1094, + "num_tokens": 31372628.0, + "reward": 0.75811767578125, + "reward_std": 0.008955802768468857, + "rewards//mean": 0.75811767578125, + "rewards//std": 0.027920546010136604, + "step": 3630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7262, + "grad_norm": 1.3449280261993408, + "kl": 0.8773439694195986, + "learning_rate": 1.7739236107186857e-07, + "loss": 0.0877, + "num_tokens": 31381268.0, + "reward": 0.748291015625, + "reward_std": 0.0035182740539312363, + "rewards//mean": 0.748291015625, + "rewards//std": 0.028939202427864075, + "step": 3631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7264, + "grad_norm": 2.0236005783081055, + "kl": 0.8079567030072212, + "learning_rate": 1.7714998370296674e-07, + "loss": 0.0808, + "num_tokens": 31389948.0, + "reward": 0.7476806640625, + "reward_std": 0.0034381961449980736, + "rewards//mean": 0.7476806640625, + "rewards//std": 0.020631449297070503, + "step": 3632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7266, + "grad_norm": 2.3733088970184326, + "kl": 1.3246294688433409, + "learning_rate": 1.7690773637801292e-07, + "loss": 0.1325, + "num_tokens": 31398540.0, + "reward": 0.7386474609375, + "reward_std": 0.005122218281030655, + "rewards//mean": 0.7386474609375, + "rewards//std": 0.023945782333612442, + "step": 3633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7268, + "grad_norm": 2.766009569168091, + "kl": 1.3521664552390575, + "learning_rate": 1.7666561919458422e-07, + "loss": 0.1352, + "num_tokens": 31407108.0, + "reward": 0.7735595703125, + "reward_std": 0.00570264644920826, + "rewards//mean": 0.7735595703125, + "rewards//std": 0.018907049670815468, + "step": 3634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.727, + "grad_norm": 3.1496782302856445, + "kl": 1.4455826915800571, + "learning_rate": 1.7642363225020557e-07, + "loss": 0.1446, + "num_tokens": 31415740.0, + "reward": 0.74993896484375, + "reward_std": 0.009274263866245747, + "rewards//mean": 0.74993896484375, + "rewards//std": 0.031715378165245056, + "step": 3635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7272, + "grad_norm": 2.379915475845337, + "kl": 1.073655053973198, + "learning_rate": 1.7618177564234904e-07, + "loss": 0.1074, + "num_tokens": 31424380.0, + "reward": 0.741943359375, + "reward_std": 0.004987068008631468, + "rewards//mean": 0.741943359375, + "rewards//std": 0.03622423857450485, + "step": 3636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7274, + "grad_norm": 3.0966336727142334, + "kl": 2.114138212054968, + "learning_rate": 1.7594004946843454e-07, + "loss": 0.2114, + "num_tokens": 31433076.0, + "reward": 0.7879638671875, + "reward_std": 0.01663976162672043, + "rewards//mean": 0.7879638671875, + "rewards//std": 0.039552170783281326, + "step": 3637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7276, + "grad_norm": 4.011733531951904, + "kl": 1.9781376272439957, + "learning_rate": 1.7569845382582937e-07, + "loss": 0.1978, + "num_tokens": 31441804.0, + "reward": 0.7772216796875, + "reward_std": 0.009672818705439568, + "rewards//mean": 0.7772216796875, + "rewards//std": 0.028356635943055153, + "step": 3638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7278, + "grad_norm": 5.788450717926025, + "kl": 1.5142025910317898, + "learning_rate": 1.7545698881184833e-07, + "loss": 0.1514, + "num_tokens": 31450460.0, + "reward": 0.7564697265625, + "reward_std": 0.004948808345943689, + "rewards//mean": 0.7564697265625, + "rewards//std": 0.035829901695251465, + "step": 3639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.728, + "grad_norm": 8.6358003616333, + "kl": 2.7571348417550325, + "learning_rate": 1.752156545237533e-07, + "loss": 0.2757, + "num_tokens": 31459020.0, + "reward": 0.7762451171875, + "reward_std": 0.009794989600777626, + "rewards//mean": 0.7762451171875, + "rewards//std": 0.03443208709359169, + "step": 3640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7282, + "grad_norm": 2.176588535308838, + "kl": 1.8630645330995321, + "learning_rate": 1.7497445105875374e-07, + "loss": 0.1863, + "num_tokens": 31467604.0, + "reward": 0.77191162109375, + "reward_std": 0.010429752990603447, + "rewards//mean": 0.77191162109375, + "rewards//std": 0.02780155837535858, + "step": 3641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7284, + "grad_norm": 3.213690757751465, + "kl": 1.0404412765055895, + "learning_rate": 1.747333785140066e-07, + "loss": 0.104, + "num_tokens": 31476220.0, + "reward": 0.76422119140625, + "reward_std": 0.0056586372666060925, + "rewards//mean": 0.76422119140625, + "rewards//std": 0.025942588225007057, + "step": 3642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7286, + "grad_norm": 11.649092674255371, + "kl": 2.3898898791521788, + "learning_rate": 1.7449243698661552e-07, + "loss": 0.239, + "num_tokens": 31484892.0, + "reward": 0.769287109375, + "reward_std": 0.004640092141926289, + "rewards//mean": 0.769287109375, + "rewards//std": 0.041045140475034714, + "step": 3643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7288, + "grad_norm": 2.7153425216674805, + "kl": 1.7106557097285986, + "learning_rate": 1.742516265736319e-07, + "loss": 0.1711, + "num_tokens": 31493580.0, + "reward": 0.76513671875, + "reward_std": 0.011033926159143448, + "rewards//mean": 0.76513671875, + "rewards//std": 0.025812797248363495, + "step": 3644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.729, + "grad_norm": 1.0211427211761475, + "kl": 0.6192583702504635, + "learning_rate": 1.7401094737205414e-07, + "loss": 0.0619, + "num_tokens": 31502228.0, + "reward": 0.75335693359375, + "reward_std": 0.003569978289306164, + "rewards//mean": 0.75335693359375, + "rewards//std": 0.021518081426620483, + "step": 3645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7292, + "grad_norm": 2.5115840435028076, + "kl": 1.0067170597612858, + "learning_rate": 1.7377039947882798e-07, + "loss": 0.1007, + "num_tokens": 31510828.0, + "reward": 0.77301025390625, + "reward_std": 0.008436158299446106, + "rewards//mean": 0.77301025390625, + "rewards//std": 0.019289391115307808, + "step": 3646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7294, + "grad_norm": 4.884538650512695, + "kl": 2.0396976247429848, + "learning_rate": 1.735299829908457e-07, + "loss": 0.204, + "num_tokens": 31519612.0, + "reward": 0.76568603515625, + "reward_std": 0.015554554760456085, + "rewards//mean": 0.76568603515625, + "rewards//std": 0.03502505645155907, + "step": 3647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7296, + "grad_norm": 2.6474764347076416, + "kl": 1.621567154303193, + "learning_rate": 1.7328969800494726e-07, + "loss": 0.1622, + "num_tokens": 31528188.0, + "reward": 0.74609375, + "reward_std": 0.009167318232357502, + "rewards//mean": 0.74609375, + "rewards//std": 0.032644957304000854, + "step": 3648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7298, + "grad_norm": 2.2285194396972656, + "kl": 1.144742975011468, + "learning_rate": 1.7304954461791937e-07, + "loss": 0.1145, + "num_tokens": 31536884.0, + "reward": 0.74322509765625, + "reward_std": 0.00591643713414669, + "rewards//mean": 0.74322509765625, + "rewards//std": 0.021261893212795258, + "step": 3649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.73, + "grad_norm": 7.985546112060547, + "kl": 2.292952636256814, + "learning_rate": 1.7280952292649598e-07, + "loss": 0.2293, + "num_tokens": 31545500.0, + "reward": 0.75323486328125, + "reward_std": 0.009691108018159866, + "rewards//mean": 0.75323486328125, + "rewards//std": 0.033076539635658264, + "step": 3650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7302, + "grad_norm": 1.637343168258667, + "kl": 0.8872905988246202, + "learning_rate": 1.725696330273575e-07, + "loss": 0.0887, + "num_tokens": 31554084.0, + "reward": 0.7828369140625, + "reward_std": 0.004649472888559103, + "rewards//mean": 0.7828369140625, + "rewards//std": 0.02492702752351761, + "step": 3651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7304, + "grad_norm": 1.4370688199996948, + "kl": 0.5872087348252535, + "learning_rate": 1.7232987501713164e-07, + "loss": 0.0587, + "num_tokens": 31562612.0, + "reward": 0.77587890625, + "reward_std": 0.0023118299432098866, + "rewards//mean": 0.77587890625, + "rewards//std": 0.02980198711156845, + "step": 3652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7306, + "grad_norm": 1.056073784828186, + "kl": 1.43581104837358, + "learning_rate": 1.7209024899239293e-07, + "loss": 0.1436, + "num_tokens": 31571340.0, + "reward": 0.751708984375, + "reward_std": 0.008612223900854588, + "rewards//mean": 0.751708984375, + "rewards//std": 0.0319010429084301, + "step": 3653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7308, + "grad_norm": 4.035719394683838, + "kl": 1.658320663496852, + "learning_rate": 1.718507550496629e-07, + "loss": 0.1658, + "num_tokens": 31579972.0, + "reward": 0.742919921875, + "reward_std": 0.008039873093366623, + "rewards//mean": 0.742919921875, + "rewards//std": 0.02921408787369728, + "step": 3654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.731, + "grad_norm": 4.3763041496276855, + "kl": 1.6967201046645641, + "learning_rate": 1.716113932854093e-07, + "loss": 0.1697, + "num_tokens": 31588604.0, + "reward": 0.7930908203125, + "reward_std": 0.010002600029110909, + "rewards//mean": 0.7930908203125, + "rewards//std": 0.0270271934568882, + "step": 3655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7312, + "grad_norm": 3.0079610347747803, + "kl": 1.6404349710792303, + "learning_rate": 1.7137216379604724e-07, + "loss": 0.164, + "num_tokens": 31597380.0, + "reward": 0.7376708984375, + "reward_std": 0.010529998689889908, + "rewards//mean": 0.7376708984375, + "rewards//std": 0.03221340849995613, + "step": 3656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7314, + "grad_norm": 1.815284252166748, + "kl": 2.1731308959424496, + "learning_rate": 1.7113306667793847e-07, + "loss": 0.2173, + "num_tokens": 31606036.0, + "reward": 0.7520751953125, + "reward_std": 0.016090426594018936, + "rewards//mean": 0.7520751953125, + "rewards//std": 0.036030445247888565, + "step": 3657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7316, + "grad_norm": 5.942960739135742, + "kl": 2.4793430976569653, + "learning_rate": 1.708941020273909e-07, + "loss": 0.2479, + "num_tokens": 31614780.0, + "reward": 0.76715087890625, + "reward_std": 0.017787551507353783, + "rewards//mean": 0.76715087890625, + "rewards//std": 0.029745997861027718, + "step": 3658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7318, + "grad_norm": 4.207793235778809, + "kl": 2.15258084051311, + "learning_rate": 1.7065526994065972e-07, + "loss": 0.2153, + "num_tokens": 31623468.0, + "reward": 0.75689697265625, + "reward_std": 0.0133521044626832, + "rewards//mean": 0.75689697265625, + "rewards//std": 0.02610429935157299, + "step": 3659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.732, + "grad_norm": 3.06423282623291, + "kl": 1.1920014042407274, + "learning_rate": 1.704165705139464e-07, + "loss": 0.1192, + "num_tokens": 31632100.0, + "reward": 0.74169921875, + "reward_std": 0.006926693022251129, + "rewards//mean": 0.74169921875, + "rewards//std": 0.0368383415043354, + "step": 3660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7322, + "grad_norm": 2.0276918411254883, + "kl": 1.828504540026188, + "learning_rate": 1.7017800384339924e-07, + "loss": 0.1829, + "num_tokens": 31640796.0, + "reward": 0.7330322265625, + "reward_std": 0.004663439467549324, + "rewards//mean": 0.7330322265625, + "rewards//std": 0.026145800948143005, + "step": 3661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7324, + "grad_norm": 5.994820594787598, + "kl": 1.2521796878427267, + "learning_rate": 1.6993957002511257e-07, + "loss": 0.1252, + "num_tokens": 31649428.0, + "reward": 0.7479248046875, + "reward_std": 0.004662121646106243, + "rewards//mean": 0.7479248046875, + "rewards//std": 0.030369292944669724, + "step": 3662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7326, + "grad_norm": 17.980852127075195, + "kl": 3.0310589987784624, + "learning_rate": 1.6970126915512756e-07, + "loss": 0.3031, + "num_tokens": 31657988.0, + "reward": 0.7491455078125, + "reward_std": 0.008295792154967785, + "rewards//mean": 0.7491455078125, + "rewards//std": 0.029663249850273132, + "step": 3663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7328, + "grad_norm": 1.5513654947280884, + "kl": 1.6581521946936846, + "learning_rate": 1.6946310132943187e-07, + "loss": 0.1658, + "num_tokens": 31666524.0, + "reward": 0.73590087890625, + "reward_std": 0.005756876431405544, + "rewards//mean": 0.73590087890625, + "rewards//std": 0.027202701196074486, + "step": 3664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.733, + "grad_norm": 6.590263366699219, + "kl": 1.8775539938360453, + "learning_rate": 1.692250666439596e-07, + "loss": 0.1878, + "num_tokens": 31675252.0, + "reward": 0.759521484375, + "reward_std": 0.00809670053422451, + "rewards//mean": 0.759521484375, + "rewards//std": 0.02408854104578495, + "step": 3665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7332, + "grad_norm": 8.969566345214844, + "kl": 1.6498516704887152, + "learning_rate": 1.6898716519459072e-07, + "loss": 0.165, + "num_tokens": 31683852.0, + "reward": 0.75994873046875, + "reward_std": 0.008494159206748009, + "rewards//mean": 0.75994873046875, + "rewards//std": 0.03990248590707779, + "step": 3666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7334, + "grad_norm": 1.2393653392791748, + "kl": 1.1085173059254885, + "learning_rate": 1.6874939707715214e-07, + "loss": 0.1109, + "num_tokens": 31692452.0, + "reward": 0.75177001953125, + "reward_std": 0.004971345420926809, + "rewards//mean": 0.75177001953125, + "rewards//std": 0.026565872132778168, + "step": 3667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7336, + "grad_norm": 7.293539047241211, + "kl": 1.908059049397707, + "learning_rate": 1.6851176238741683e-07, + "loss": 0.1908, + "num_tokens": 31701188.0, + "reward": 0.770263671875, + "reward_std": 0.006942110136151314, + "rewards//mean": 0.770263671875, + "rewards//std": 0.028227422386407852, + "step": 3668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7338, + "grad_norm": 4.054732799530029, + "kl": 1.4271561726927757, + "learning_rate": 1.6827426122110412e-07, + "loss": 0.1427, + "num_tokens": 31709900.0, + "reward": 0.78302001953125, + "reward_std": 0.013728762976825237, + "rewards//mean": 0.78302001953125, + "rewards//std": 0.022998636588454247, + "step": 3669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.734, + "grad_norm": 2.69773530960083, + "kl": 0.976861085742712, + "learning_rate": 1.6803689367387918e-07, + "loss": 0.0977, + "num_tokens": 31718500.0, + "reward": 0.7744140625, + "reward_std": 0.0066927168518304825, + "rewards//mean": 0.7744140625, + "rewards//std": 0.02051703818142414, + "step": 3670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7342, + "grad_norm": 6.958800792694092, + "kl": 3.2058827076107264, + "learning_rate": 1.6779965984135374e-07, + "loss": 0.3206, + "num_tokens": 31727252.0, + "reward": 0.81378173828125, + "reward_std": 0.016468260437250137, + "rewards//mean": 0.81378173828125, + "rewards//std": 0.029504306614398956, + "step": 3671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7344, + "grad_norm": 1.3585416078567505, + "kl": 1.3546293526887894, + "learning_rate": 1.675625598190858e-07, + "loss": 0.1355, + "num_tokens": 31735940.0, + "reward": 0.769287109375, + "reward_std": 0.010472102090716362, + "rewards//mean": 0.769287109375, + "rewards//std": 0.032315924763679504, + "step": 3672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7346, + "grad_norm": 2.1942405700683594, + "kl": 1.010774439200759, + "learning_rate": 1.6732559370257882e-07, + "loss": 0.1011, + "num_tokens": 31744548.0, + "reward": 0.77740478515625, + "reward_std": 0.011380745097994804, + "rewards//mean": 0.77740478515625, + "rewards//std": 0.03140842542052269, + "step": 3673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7348, + "grad_norm": 3.3301074504852295, + "kl": 1.1166346222162247, + "learning_rate": 1.670887615872829e-07, + "loss": 0.1117, + "num_tokens": 31753228.0, + "reward": 0.78302001953125, + "reward_std": 0.014034217223525047, + "rewards//mean": 0.78302001953125, + "rewards//std": 0.030804501846432686, + "step": 3674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.735, + "grad_norm": 2.179910182952881, + "kl": 1.0851857513189316, + "learning_rate": 1.6685206356859398e-07, + "loss": 0.1085, + "num_tokens": 31761836.0, + "reward": 0.77215576171875, + "reward_std": 0.0072040678933262825, + "rewards//mean": 0.77215576171875, + "rewards//std": 0.02184413932263851, + "step": 3675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7352, + "grad_norm": 1.5123789310455322, + "kl": 0.6257348749786615, + "learning_rate": 1.6661549974185424e-07, + "loss": 0.0626, + "num_tokens": 31770556.0, + "reward": 0.80712890625, + "reward_std": 0.0, + "rewards//mean": 0.80712890625, + "rewards//std": 0.016105882823467255, + "step": 3676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7354, + "grad_norm": 0.9837465882301331, + "kl": 1.1552923452109098, + "learning_rate": 1.6637907020235114e-07, + "loss": 0.1155, + "num_tokens": 31779100.0, + "reward": 0.7333984375, + "reward_std": 0.004011887591332197, + "rewards//mean": 0.7333984375, + "rewards//std": 0.02716156095266342, + "step": 3677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7356, + "grad_norm": 2.0074195861816406, + "kl": 1.0637095719575882, + "learning_rate": 1.6614277504531866e-07, + "loss": 0.1064, + "num_tokens": 31787756.0, + "reward": 0.75811767578125, + "reward_std": 0.00644956948235631, + "rewards//mean": 0.75811767578125, + "rewards//std": 0.026331210508942604, + "step": 3678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7358, + "grad_norm": 3.1956799030303955, + "kl": 1.3545568063855171, + "learning_rate": 1.659066143659366e-07, + "loss": 0.1355, + "num_tokens": 31796444.0, + "reward": 0.72607421875, + "reward_std": 0.010137047618627548, + "rewards//mean": 0.72607421875, + "rewards//std": 0.03782450407743454, + "step": 3679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.736, + "grad_norm": 2.8320846557617188, + "kl": 1.0130792614072561, + "learning_rate": 1.6567058825933022e-07, + "loss": 0.1013, + "num_tokens": 31804996.0, + "reward": 0.77423095703125, + "reward_std": 0.009003892540931702, + "rewards//mean": 0.77423095703125, + "rewards//std": 0.02835843898355961, + "step": 3680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7362, + "grad_norm": 6.376828193664551, + "kl": 1.175009809434414, + "learning_rate": 1.6543469682057104e-07, + "loss": 0.1175, + "num_tokens": 31813652.0, + "reward": 0.7718505859375, + "reward_std": 0.005941000301390886, + "rewards//mean": 0.7718505859375, + "rewards//std": 0.0312555693089962, + "step": 3681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7364, + "grad_norm": 2.740814208984375, + "kl": 1.5440131891518831, + "learning_rate": 1.6519894014467578e-07, + "loss": 0.1544, + "num_tokens": 31822228.0, + "reward": 0.78302001953125, + "reward_std": 0.008817780762910843, + "rewards//mean": 0.78302001953125, + "rewards//std": 0.03445642441511154, + "step": 3682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7366, + "grad_norm": 2.605287551879883, + "kl": 2.241225216537714, + "learning_rate": 1.6496331832660742e-07, + "loss": 0.2241, + "num_tokens": 31830780.0, + "reward": 0.77587890625, + "reward_std": 0.014115373603999615, + "rewards//mean": 0.77587890625, + "rewards//std": 0.02761697582900524, + "step": 3683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7368, + "grad_norm": 2.0110344886779785, + "kl": 1.3874699845910072, + "learning_rate": 1.6472783146127438e-07, + "loss": 0.1387, + "num_tokens": 31839356.0, + "reward": 0.73779296875, + "reward_std": 0.006259104236960411, + "rewards//mean": 0.73779296875, + "rewards//std": 0.0305167268961668, + "step": 3684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.737, + "grad_norm": 1.8049010038375854, + "kl": 1.5215303003787994, + "learning_rate": 1.644924796435309e-07, + "loss": 0.1522, + "num_tokens": 31848020.0, + "reward": 0.77197265625, + "reward_std": 0.012044377624988556, + "rewards//mean": 0.77197265625, + "rewards//std": 0.043811630457639694, + "step": 3685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7372, + "grad_norm": 5.865328311920166, + "kl": 2.059690000489354, + "learning_rate": 1.6425726296817632e-07, + "loss": 0.206, + "num_tokens": 31856660.0, + "reward": 0.74420166015625, + "reward_std": 0.0115470876917243, + "rewards//mean": 0.74420166015625, + "rewards//std": 0.043252427130937576, + "step": 3686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7374, + "grad_norm": 1.109811782836914, + "kl": 1.4801658038049936, + "learning_rate": 1.6402218152995607e-07, + "loss": 0.148, + "num_tokens": 31865372.0, + "reward": 0.7398681640625, + "reward_std": 0.009936993941664696, + "rewards//mean": 0.7398681640625, + "rewards//std": 0.037678346037864685, + "step": 3687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7376, + "grad_norm": 11.611242294311523, + "kl": 2.4189673587679863, + "learning_rate": 1.637872354235611e-07, + "loss": 0.2419, + "num_tokens": 31873972.0, + "reward": 0.75164794921875, + "reward_std": 0.012917518615722656, + "rewards//mean": 0.75164794921875, + "rewards//std": 0.02675551362335682, + "step": 3688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7378, + "grad_norm": 1.1606810092926025, + "kl": 0.6194041315466166, + "learning_rate": 1.6355242474362728e-07, + "loss": 0.0619, + "num_tokens": 31882724.0, + "reward": 0.76806640625, + "reward_std": 0.0037587760016322136, + "rewards//mean": 0.76806640625, + "rewards//std": 0.02663480117917061, + "step": 3689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.738, + "grad_norm": 5.6923828125, + "kl": 1.5565295461565256, + "learning_rate": 1.633177495847366e-07, + "loss": 0.1557, + "num_tokens": 31891316.0, + "reward": 0.77105712890625, + "reward_std": 0.006274091079831123, + "rewards//mean": 0.77105712890625, + "rewards//std": 0.017574572935700417, + "step": 3690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7382, + "grad_norm": 3.090585231781006, + "kl": 1.311974573880434, + "learning_rate": 1.6308321004141607e-07, + "loss": 0.1312, + "num_tokens": 31899980.0, + "reward": 0.74761962890625, + "reward_std": 0.011259451508522034, + "rewards//mean": 0.74761962890625, + "rewards//std": 0.028802813962101936, + "step": 3691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7384, + "grad_norm": 3.129936695098877, + "kl": 1.3225679136812687, + "learning_rate": 1.6284880620813846e-07, + "loss": 0.1323, + "num_tokens": 31908548.0, + "reward": 0.7568359375, + "reward_std": 0.004199131391942501, + "rewards//mean": 0.7568359375, + "rewards//std": 0.028527216985821724, + "step": 3692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7386, + "grad_norm": 2.2138671875, + "kl": 1.1197083182632923, + "learning_rate": 1.6261453817932119e-07, + "loss": 0.112, + "num_tokens": 31917164.0, + "reward": 0.77154541015625, + "reward_std": 0.009835449978709221, + "rewards//mean": 0.77154541015625, + "rewards//std": 0.025985730811953545, + "step": 3693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7388, + "grad_norm": 1.6211321353912354, + "kl": 1.2124909963458776, + "learning_rate": 1.6238040604932757e-07, + "loss": 0.1212, + "num_tokens": 31925756.0, + "reward": 0.7718505859375, + "reward_std": 0.0068723405711352825, + "rewards//mean": 0.7718505859375, + "rewards//std": 0.03189985826611519, + "step": 3694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.739, + "grad_norm": 4.031853199005127, + "kl": 2.6197838876396418, + "learning_rate": 1.6214640991246609e-07, + "loss": 0.262, + "num_tokens": 31934508.0, + "reward": 0.7364501953125, + "reward_std": 0.01330025214701891, + "rewards//mean": 0.7364501953125, + "rewards//std": 0.035969894379377365, + "step": 3695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7392, + "grad_norm": 5.185155868530273, + "kl": 1.325111623853445, + "learning_rate": 1.6191254986299042e-07, + "loss": 0.1325, + "num_tokens": 31943132.0, + "reward": 0.76507568359375, + "reward_std": 0.00578394066542387, + "rewards//mean": 0.76507568359375, + "rewards//std": 0.027873337268829346, + "step": 3696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7394, + "grad_norm": 4.835397243499756, + "kl": 1.891045119613409, + "learning_rate": 1.6167882599509902e-07, + "loss": 0.1891, + "num_tokens": 31951716.0, + "reward": 0.74676513671875, + "reward_std": 0.010816682130098343, + "rewards//mean": 0.74676513671875, + "rewards//std": 0.03186062350869179, + "step": 3697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7396, + "grad_norm": 1.265910267829895, + "kl": 1.073219794780016, + "learning_rate": 1.614452384029361e-07, + "loss": 0.1073, + "num_tokens": 31960396.0, + "reward": 0.7493896484375, + "reward_std": 0.007787371054291725, + "rewards//mean": 0.7493896484375, + "rewards//std": 0.022393440827727318, + "step": 3698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7398, + "grad_norm": 10.614566802978516, + "kl": 1.5748443938791752, + "learning_rate": 1.612117871805907e-07, + "loss": 0.1575, + "num_tokens": 31969132.0, + "reward": 0.76715087890625, + "reward_std": 0.008154649287462234, + "rewards//mean": 0.76715087890625, + "rewards//std": 0.03335408866405487, + "step": 3699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.74, + "grad_norm": 4.227376937866211, + "kl": 1.4488353449851274, + "learning_rate": 1.60978472422097e-07, + "loss": 0.1449, + "num_tokens": 31977844.0, + "reward": 0.75177001953125, + "reward_std": 0.016136107966303825, + "rewards//mean": 0.75177001953125, + "rewards//std": 0.02925596386194229, + "step": 3700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7402, + "grad_norm": 1.7726037502288818, + "kl": 1.2976163625717163, + "learning_rate": 1.6074529422143396e-07, + "loss": 0.1298, + "num_tokens": 31986564.0, + "reward": 0.7877197265625, + "reward_std": 0.009573964402079582, + "rewards//mean": 0.7877197265625, + "rewards//std": 0.038287315517663956, + "step": 3701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7404, + "grad_norm": 8.24777889251709, + "kl": 2.561319265514612, + "learning_rate": 1.6051225267252583e-07, + "loss": 0.2561, + "num_tokens": 31995468.0, + "reward": 0.73828125, + "reward_std": 0.0102771557867527, + "rewards//mean": 0.73828125, + "rewards//std": 0.03663065284490585, + "step": 3702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7406, + "grad_norm": 2.316319227218628, + "kl": 1.3027221914380789, + "learning_rate": 1.6027934786924185e-07, + "loss": 0.1303, + "num_tokens": 32004004.0, + "reward": 0.7821044921875, + "reward_std": 0.006560072302818298, + "rewards//mean": 0.7821044921875, + "rewards//std": 0.02287762239575386, + "step": 3703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7408, + "grad_norm": 5.555315017700195, + "kl": 1.3113633301109076, + "learning_rate": 1.6004657990539578e-07, + "loss": 0.1311, + "num_tokens": 32012644.0, + "reward": 0.7535400390625, + "reward_std": 0.010316809639334679, + "rewards//mean": 0.7535400390625, + "rewards//std": 0.02559814602136612, + "step": 3704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.741, + "grad_norm": 3.780252456665039, + "kl": 1.5089221242815256, + "learning_rate": 1.598139488747467e-07, + "loss": 0.1509, + "num_tokens": 32021260.0, + "reward": 0.74847412109375, + "reward_std": 0.008789447136223316, + "rewards//mean": 0.74847412109375, + "rewards//std": 0.020107053220272064, + "step": 3705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7412, + "grad_norm": 2.3137834072113037, + "kl": 1.0355698578059673, + "learning_rate": 1.5958145487099827e-07, + "loss": 0.1036, + "num_tokens": 32029812.0, + "reward": 0.7568359375, + "reward_std": 0.007643547840416431, + "rewards//mean": 0.7568359375, + "rewards//std": 0.02273458242416382, + "step": 3706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7414, + "grad_norm": 0.29372066259384155, + "kl": 0.4471759982407093, + "learning_rate": 1.5934909798779933e-07, + "loss": 0.0447, + "num_tokens": 32038404.0, + "reward": 0.774658203125, + "reward_std": 0.0005712973070330918, + "rewards//mean": 0.774658203125, + "rewards//std": 0.026565372943878174, + "step": 3707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7416, + "grad_norm": 0.8981776237487793, + "kl": 0.6584113966673613, + "learning_rate": 1.5911687831874278e-07, + "loss": 0.0658, + "num_tokens": 32047020.0, + "reward": 0.7696533203125, + "reward_std": 0.0037979367189109325, + "rewards//mean": 0.7696533203125, + "rewards//std": 0.02351970598101616, + "step": 3708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7418, + "grad_norm": 2.649975299835205, + "kl": 0.7335557471960783, + "learning_rate": 1.5888479595736694e-07, + "loss": 0.0734, + "num_tokens": 32055612.0, + "reward": 0.74365234375, + "reward_std": 0.006750732194632292, + "rewards//mean": 0.74365234375, + "rewards//std": 0.025869034230709076, + "step": 3709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.742, + "grad_norm": 4.865139484405518, + "kl": 0.830951763316989, + "learning_rate": 1.5865285099715442e-07, + "loss": 0.0831, + "num_tokens": 32064284.0, + "reward": 0.774658203125, + "reward_std": 0.006969161797314882, + "rewards//mean": 0.774658203125, + "rewards//std": 0.022422151640057564, + "step": 3710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7422, + "grad_norm": 2.07719349861145, + "kl": 1.7714163288474083, + "learning_rate": 1.5842104353153285e-07, + "loss": 0.1771, + "num_tokens": 32072844.0, + "reward": 0.7574462890625, + "reward_std": 0.016687996685504913, + "rewards//mean": 0.7574462890625, + "rewards//std": 0.03163299709558487, + "step": 3711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7424, + "grad_norm": 1.4347649812698364, + "kl": 0.8745063710957766, + "learning_rate": 1.5818937365387396e-07, + "loss": 0.0875, + "num_tokens": 32081572.0, + "reward": 0.7723388671875, + "reward_std": 0.006130536086857319, + "rewards//mean": 0.7723388671875, + "rewards//std": 0.030595744028687477, + "step": 3712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7426, + "grad_norm": 2.32271146774292, + "kl": 1.3971271812915802, + "learning_rate": 1.5795784145749453e-07, + "loss": 0.1397, + "num_tokens": 32090204.0, + "reward": 0.73431396484375, + "reward_std": 0.008881470188498497, + "rewards//mean": 0.73431396484375, + "rewards//std": 0.023638075217604637, + "step": 3713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7428, + "grad_norm": 9.08750057220459, + "kl": 1.466499213129282, + "learning_rate": 1.5772644703565564e-07, + "loss": 0.1466, + "num_tokens": 32098948.0, + "reward": 0.762939453125, + "reward_std": 0.003680011723190546, + "rewards//mean": 0.762939453125, + "rewards//std": 0.036649659276008606, + "step": 3714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.743, + "grad_norm": 10.255487442016602, + "kl": 2.2387324273586273, + "learning_rate": 1.5749519048156306e-07, + "loss": 0.2239, + "num_tokens": 32107692.0, + "reward": 0.7783203125, + "reward_std": 0.016130223870277405, + "rewards//mean": 0.7783203125, + "rewards//std": 0.0388324074447155, + "step": 3715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7432, + "grad_norm": 2.949744701385498, + "kl": 0.8531262949109077, + "learning_rate": 1.5726407188836672e-07, + "loss": 0.0853, + "num_tokens": 32116284.0, + "reward": 0.7484130859375, + "reward_std": 0.006560072302818298, + "rewards//mean": 0.7484130859375, + "rewards//std": 0.029803765937685966, + "step": 3716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7434, + "grad_norm": 3.0127406120300293, + "kl": 1.055157609283924, + "learning_rate": 1.5703309134916116e-07, + "loss": 0.1055, + "num_tokens": 32124932.0, + "reward": 0.745361328125, + "reward_std": 0.008582555688917637, + "rewards//mean": 0.745361328125, + "rewards//std": 0.027864713221788406, + "step": 3717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7436, + "grad_norm": 2.54862904548645, + "kl": 1.1906484961509705, + "learning_rate": 1.5680224895698558e-07, + "loss": 0.1191, + "num_tokens": 32133580.0, + "reward": 0.767333984375, + "reward_std": 0.009091516956686974, + "rewards//mean": 0.767333984375, + "rewards//std": 0.036029815673828125, + "step": 3718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7438, + "grad_norm": 5.062285423278809, + "kl": 1.829509899020195, + "learning_rate": 1.5657154480482293e-07, + "loss": 0.183, + "num_tokens": 32142308.0, + "reward": 0.76788330078125, + "reward_std": 0.010600619949400425, + "rewards//mean": 0.76788330078125, + "rewards//std": 0.028517067432403564, + "step": 3719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.744, + "grad_norm": 0.943900465965271, + "kl": 1.0451509933918715, + "learning_rate": 1.5634097898560096e-07, + "loss": 0.1045, + "num_tokens": 32150964.0, + "reward": 0.76861572265625, + "reward_std": 0.006799315568059683, + "rewards//mean": 0.76861572265625, + "rewards//std": 0.034356553107500076, + "step": 3720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7442, + "grad_norm": 1.3675764799118042, + "kl": 1.3628126215189695, + "learning_rate": 1.561105515921915e-07, + "loss": 0.1363, + "num_tokens": 32159548.0, + "reward": 0.7509765625, + "reward_std": 0.008399026468396187, + "rewards//mean": 0.7509765625, + "rewards//std": 0.026438569650053978, + "step": 3721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7444, + "grad_norm": 1.4864646196365356, + "kl": 0.8481197394430637, + "learning_rate": 1.5588026271741095e-07, + "loss": 0.0848, + "num_tokens": 32168140.0, + "reward": 0.7501220703125, + "reward_std": 0.0037979367189109325, + "rewards//mean": 0.7501220703125, + "rewards//std": 0.02101816236972809, + "step": 3722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7446, + "grad_norm": 3.445319652557373, + "kl": 0.8973574265837669, + "learning_rate": 1.5565011245401927e-07, + "loss": 0.0897, + "num_tokens": 32176796.0, + "reward": 0.77716064453125, + "reward_std": 0.0035026399418711662, + "rewards//mean": 0.77716064453125, + "rewards//std": 0.023800816386938095, + "step": 3723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7448, + "grad_norm": 1.8187365531921387, + "kl": 1.4890708606690168, + "learning_rate": 1.5542010089472108e-07, + "loss": 0.1489, + "num_tokens": 32185468.0, + "reward": 0.71710205078125, + "reward_std": 0.006399991922080517, + "rewards//mean": 0.71710205078125, + "rewards//std": 0.03962763771414757, + "step": 3724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.745, + "grad_norm": 1.7749907970428467, + "kl": 1.435535229742527, + "learning_rate": 1.551902281321651e-07, + "loss": 0.1436, + "num_tokens": 32194116.0, + "reward": 0.7626953125, + "reward_std": 0.007539596874266863, + "rewards//mean": 0.7626953125, + "rewards//std": 0.03425246477127075, + "step": 3725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7452, + "grad_norm": 3.9980294704437256, + "kl": 1.5127747152000666, + "learning_rate": 1.5496049425894408e-07, + "loss": 0.1513, + "num_tokens": 32202884.0, + "reward": 0.74969482421875, + "reward_std": 0.010705679655075073, + "rewards//mean": 0.74969482421875, + "rewards//std": 0.032624952495098114, + "step": 3726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7454, + "grad_norm": 1.7766250371932983, + "kl": 1.2001825720071793, + "learning_rate": 1.5473089936759458e-07, + "loss": 0.12, + "num_tokens": 32211500.0, + "reward": 0.79119873046875, + "reward_std": 0.007803767919540405, + "rewards//mean": 0.79119873046875, + "rewards//std": 0.025064727291464806, + "step": 3727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7456, + "grad_norm": 0.28089919686317444, + "kl": 0.4322301298379898, + "learning_rate": 1.5450144355059752e-07, + "loss": 0.0432, + "num_tokens": 32220188.0, + "reward": 0.77899169921875, + "reward_std": 0.0005893231718800962, + "rewards//mean": 0.77899169921875, + "rewards//std": 0.02076118439435959, + "step": 3728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7458, + "grad_norm": 1.611114263534546, + "kl": 1.5209173560142517, + "learning_rate": 1.542721269003777e-07, + "loss": 0.1521, + "num_tokens": 32228764.0, + "reward": 0.71160888671875, + "reward_std": 0.008977998048067093, + "rewards//mean": 0.71160888671875, + "rewards//std": 0.03396526724100113, + "step": 3729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.746, + "grad_norm": 3.9468460083007812, + "kl": 1.039316175505519, + "learning_rate": 1.5404294950930397e-07, + "loss": 0.1039, + "num_tokens": 32237388.0, + "reward": 0.74810791015625, + "reward_std": 0.004884968977421522, + "rewards//mean": 0.74810791015625, + "rewards//std": 0.028535639867186546, + "step": 3730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7462, + "grad_norm": 7.642608642578125, + "kl": 1.7990350909531116, + "learning_rate": 1.5381391146968863e-07, + "loss": 0.1799, + "num_tokens": 32246036.0, + "reward": 0.74920654296875, + "reward_std": 0.00919945165514946, + "rewards//mean": 0.74920654296875, + "rewards//std": 0.03500214219093323, + "step": 3731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7464, + "grad_norm": 1.88874351978302, + "kl": 1.4023763556033373, + "learning_rate": 1.535850128737884e-07, + "loss": 0.1402, + "num_tokens": 32254660.0, + "reward": 0.7772216796875, + "reward_std": 0.00822986289858818, + "rewards//mean": 0.7772216796875, + "rewards//std": 0.025209631770849228, + "step": 3732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7466, + "grad_norm": 1.7771401405334473, + "kl": 1.4779158141463995, + "learning_rate": 1.5335625381380364e-07, + "loss": 0.1478, + "num_tokens": 32263300.0, + "reward": 0.775146484375, + "reward_std": 0.0053057437762618065, + "rewards//mean": 0.775146484375, + "rewards//std": 0.03151914104819298, + "step": 3733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7468, + "grad_norm": 3.3674259185791016, + "kl": 0.8208715356886387, + "learning_rate": 1.5312763438187826e-07, + "loss": 0.0821, + "num_tokens": 32271924.0, + "reward": 0.74688720703125, + "reward_std": 0.0035788509994745255, + "rewards//mean": 0.74688720703125, + "rewards//std": 0.026431052014231682, + "step": 3734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.747, + "grad_norm": 3.3829214572906494, + "kl": 1.0841650869697332, + "learning_rate": 1.5289915467010029e-07, + "loss": 0.1084, + "num_tokens": 32280556.0, + "reward": 0.7816162109375, + "reward_std": 0.007751616649329662, + "rewards//mean": 0.7816162109375, + "rewards//std": 0.028575729578733444, + "step": 3735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7472, + "grad_norm": 2.419546365737915, + "kl": 1.1570704635232687, + "learning_rate": 1.5267081477050131e-07, + "loss": 0.1157, + "num_tokens": 32289300.0, + "reward": 0.7840576171875, + "reward_std": 0.009536270052194595, + "rewards//mean": 0.7840576171875, + "rewards//std": 0.03668000176548958, + "step": 3736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7474, + "grad_norm": 2.256959915161133, + "kl": 1.394791379570961, + "learning_rate": 1.5244261477505676e-07, + "loss": 0.1395, + "num_tokens": 32297980.0, + "reward": 0.7347412109375, + "reward_std": 0.007939008064568043, + "rewards//mean": 0.7347412109375, + "rewards//std": 0.020806794986128807, + "step": 3737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7476, + "grad_norm": 1.2384107112884521, + "kl": 0.8838770017027855, + "learning_rate": 1.5221455477568523e-07, + "loss": 0.0884, + "num_tokens": 32306524.0, + "reward": 0.77008056640625, + "reward_std": 0.005022465251386166, + "rewards//mean": 0.77008056640625, + "rewards//std": 0.01949857361614704, + "step": 3738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7478, + "grad_norm": 5.100124835968018, + "kl": 1.9634501803666353, + "learning_rate": 1.5198663486424944e-07, + "loss": 0.1963, + "num_tokens": 32315124.0, + "reward": 0.7679443359375, + "reward_std": 0.008668426424264908, + "rewards//mean": 0.7679443359375, + "rewards//std": 0.040485721081495285, + "step": 3739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.748, + "grad_norm": 4.079394817352295, + "kl": 0.636536268517375, + "learning_rate": 1.517588551325556e-07, + "loss": 0.0637, + "num_tokens": 32323724.0, + "reward": 0.71832275390625, + "reward_std": 0.0035420539788901806, + "rewards//mean": 0.71832275390625, + "rewards//std": 0.03711748123168945, + "step": 3740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7482, + "grad_norm": 4.040902137756348, + "kl": 1.5739379804581404, + "learning_rate": 1.5153121567235333e-07, + "loss": 0.1574, + "num_tokens": 32332404.0, + "reward": 0.76641845703125, + "reward_std": 0.01204417459666729, + "rewards//mean": 0.76641845703125, + "rewards//std": 0.028204286471009254, + "step": 3741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7484, + "grad_norm": 3.644280433654785, + "kl": 1.5207661595195532, + "learning_rate": 1.5130371657533558e-07, + "loss": 0.1521, + "num_tokens": 32341076.0, + "reward": 0.7242431640625, + "reward_std": 0.008514964953064919, + "rewards//mean": 0.7242431640625, + "rewards//std": 0.023895155638456345, + "step": 3742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7486, + "grad_norm": 2.1579749584198, + "kl": 1.0293235592544079, + "learning_rate": 1.510763579331391e-07, + "loss": 0.1029, + "num_tokens": 32349684.0, + "reward": 0.73077392578125, + "reward_std": 0.009115541353821754, + "rewards//mean": 0.73077392578125, + "rewards//std": 0.04336043447256088, + "step": 3743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7488, + "grad_norm": 8.236746788024902, + "kl": 2.0344431828707457, + "learning_rate": 1.5084913983734393e-07, + "loss": 0.2034, + "num_tokens": 32358316.0, + "reward": 0.72259521484375, + "reward_std": 0.006815649569034576, + "rewards//mean": 0.72259521484375, + "rewards//std": 0.049187012016773224, + "step": 3744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.749, + "grad_norm": 2.1717538833618164, + "kl": 1.7864215727895498, + "learning_rate": 1.5062206237947362e-07, + "loss": 0.1786, + "num_tokens": 32366956.0, + "reward": 0.74847412109375, + "reward_std": 0.015851113945245743, + "rewards//mean": 0.74847412109375, + "rewards//std": 0.030045755207538605, + "step": 3745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7492, + "grad_norm": 1.8513010740280151, + "kl": 0.853464288637042, + "learning_rate": 1.5039512565099466e-07, + "loss": 0.0853, + "num_tokens": 32375500.0, + "reward": 0.72454833984375, + "reward_std": 0.0018086567288264632, + "rewards//mean": 0.72454833984375, + "rewards//std": 0.02588300220668316, + "step": 3746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7494, + "grad_norm": 3.092240333557129, + "kl": 0.8434834443032742, + "learning_rate": 1.5016832974331723e-07, + "loss": 0.0843, + "num_tokens": 32384140.0, + "reward": 0.7353515625, + "reward_std": 0.0027922862209379673, + "rewards//mean": 0.7353515625, + "rewards//std": 0.03764156624674797, + "step": 3747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7496, + "grad_norm": 8.905425071716309, + "kl": 3.1536107677966356, + "learning_rate": 1.499416747477948e-07, + "loss": 0.3154, + "num_tokens": 32392700.0, + "reward": 0.7435302734375, + "reward_std": 0.017181511968374252, + "rewards//mean": 0.7435302734375, + "rewards//std": 0.047123488038778305, + "step": 3748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7498, + "grad_norm": 1.7121204137802124, + "kl": 0.7822571042925119, + "learning_rate": 1.4971516075572405e-07, + "loss": 0.0782, + "num_tokens": 32401212.0, + "reward": 0.7515869140625, + "reward_std": 0.0027205084916204214, + "rewards//mean": 0.7515869140625, + "rewards//std": 0.015464809723198414, + "step": 3749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.75, + "grad_norm": 4.030794143676758, + "kl": 1.2744378801435232, + "learning_rate": 1.494887878583445e-07, + "loss": 0.1274, + "num_tokens": 32409860.0, + "reward": 0.76141357421875, + "reward_std": 0.007231231313198805, + "rewards//mean": 0.76141357421875, + "rewards//std": 0.030538978055119514, + "step": 3750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7502, + "grad_norm": 1.8419945240020752, + "kl": 1.2095996253192425, + "learning_rate": 1.492625561468393e-07, + "loss": 0.121, + "num_tokens": 32418444.0, + "reward": 0.7642822265625, + "reward_std": 0.00647533405572176, + "rewards//mean": 0.7642822265625, + "rewards//std": 0.019884197041392326, + "step": 3751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7504, + "grad_norm": 7.768123626708984, + "kl": 1.878671832382679, + "learning_rate": 1.490364657123347e-07, + "loss": 0.1879, + "num_tokens": 32427068.0, + "reward": 0.7261962890625, + "reward_std": 0.012895622290670872, + "rewards//mean": 0.7261962890625, + "rewards//std": 0.03904835507273674, + "step": 3752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7506, + "grad_norm": 6.359330654144287, + "kl": 1.7379688825458288, + "learning_rate": 1.4881051664589956e-07, + "loss": 0.1738, + "num_tokens": 32435740.0, + "reward": 0.70745849609375, + "reward_std": 0.005499579012393951, + "rewards//mean": 0.70745849609375, + "rewards//std": 0.03580893203616142, + "step": 3753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7508, + "grad_norm": 0.8453320264816284, + "kl": 0.8101138416677713, + "learning_rate": 1.485847090385463e-07, + "loss": 0.081, + "num_tokens": 32444364.0, + "reward": 0.743408203125, + "reward_std": 0.003115184372290969, + "rewards//mean": 0.743408203125, + "rewards//std": 0.029494620859622955, + "step": 3754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.751, + "grad_norm": 5.848021030426025, + "kl": 1.1073383130133152, + "learning_rate": 1.4835904298123026e-07, + "loss": 0.1107, + "num_tokens": 32452940.0, + "reward": 0.763916015625, + "reward_std": 0.007665499113500118, + "rewards//mean": 0.763916015625, + "rewards//std": 0.028939202427864075, + "step": 3755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7512, + "grad_norm": 1.088681697845459, + "kl": 0.6268626544624567, + "learning_rate": 1.481335185648498e-07, + "loss": 0.0627, + "num_tokens": 32461564.0, + "reward": 0.7691650390625, + "reward_std": 0.002416868694126606, + "rewards//mean": 0.7691650390625, + "rewards//std": 0.03081856667995453, + "step": 3756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7514, + "grad_norm": 2.262465715408325, + "kl": 1.6962295807898045, + "learning_rate": 1.4790813588024581e-07, + "loss": 0.1696, + "num_tokens": 32470220.0, + "reward": 0.78253173828125, + "reward_std": 0.012020817026495934, + "rewards//mean": 0.78253173828125, + "rewards//std": 0.03115728124976158, + "step": 3757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7516, + "grad_norm": 1.11082923412323, + "kl": 0.8690585792064667, + "learning_rate": 1.4768289501820263e-07, + "loss": 0.0869, + "num_tokens": 32478788.0, + "reward": 0.760009765625, + "reward_std": 0.005477374419569969, + "rewards//mean": 0.760009765625, + "rewards//std": 0.028492173179984093, + "step": 3758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7518, + "grad_norm": 5.99229621887207, + "kl": 1.1431711483746767, + "learning_rate": 1.4745779606944714e-07, + "loss": 0.1143, + "num_tokens": 32487428.0, + "reward": 0.7340087890625, + "reward_std": 0.0031062725465744734, + "rewards//mean": 0.7340087890625, + "rewards//std": 0.028643455356359482, + "step": 3759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.752, + "grad_norm": 2.6186275482177734, + "kl": 1.7977796513587236, + "learning_rate": 1.472328391246494e-07, + "loss": 0.1798, + "num_tokens": 32496156.0, + "reward": 0.73565673828125, + "reward_std": 0.007994899526238441, + "rewards//mean": 0.73565673828125, + "rewards//std": 0.027472369372844696, + "step": 3760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7522, + "grad_norm": 1.8709213733673096, + "kl": 1.6359541863203049, + "learning_rate": 1.4700802427442178e-07, + "loss": 0.1636, + "num_tokens": 32504772.0, + "reward": 0.7552490234375, + "reward_std": 0.00914305355399847, + "rewards//mean": 0.7552490234375, + "rewards//std": 0.033991385251283646, + "step": 3761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7524, + "grad_norm": 2.4370055198669434, + "kl": 2.0486717894673347, + "learning_rate": 1.4678335160931972e-07, + "loss": 0.2049, + "num_tokens": 32513380.0, + "reward": 0.73992919921875, + "reward_std": 0.013181449845433235, + "rewards//mean": 0.73992919921875, + "rewards//std": 0.03820236772298813, + "step": 3762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7526, + "grad_norm": 2.999959707260132, + "kl": 2.315191576257348, + "learning_rate": 1.4655882121984136e-07, + "loss": 0.2315, + "num_tokens": 32521972.0, + "reward": 0.76239013671875, + "reward_std": 0.014452110975980759, + "rewards//mean": 0.76239013671875, + "rewards//std": 0.04138410463929176, + "step": 3763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7528, + "grad_norm": 1.9179967641830444, + "kl": 0.8019184991717339, + "learning_rate": 1.4633443319642792e-07, + "loss": 0.0802, + "num_tokens": 32530572.0, + "reward": 0.771728515625, + "reward_std": 0.005062445066869259, + "rewards//mean": 0.771728515625, + "rewards//std": 0.02683749608695507, + "step": 3764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.753, + "grad_norm": 6.095722675323486, + "kl": 1.5532522909343243, + "learning_rate": 1.4611018762946215e-07, + "loss": 0.1553, + "num_tokens": 32539188.0, + "reward": 0.74505615234375, + "reward_std": 0.008957553654909134, + "rewards//mean": 0.74505615234375, + "rewards//std": 0.039599742740392685, + "step": 3765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7532, + "grad_norm": 2.6907668113708496, + "kl": 1.295752689242363, + "learning_rate": 1.4588608460927048e-07, + "loss": 0.1296, + "num_tokens": 32547748.0, + "reward": 0.770751953125, + "reward_std": 0.01064821146428585, + "rewards//mean": 0.770751953125, + "rewards//std": 0.028762908652424812, + "step": 3766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7534, + "grad_norm": 10.922796249389648, + "kl": 2.8001274801790714, + "learning_rate": 1.4566212422612156e-07, + "loss": 0.28, + "num_tokens": 32556436.0, + "reward": 0.730712890625, + "reward_std": 0.017250798642635345, + "rewards//mean": 0.730712890625, + "rewards//std": 0.047597337514162064, + "step": 3767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7536, + "grad_norm": 11.01187801361084, + "kl": 2.432251665741205, + "learning_rate": 1.4543830657022682e-07, + "loss": 0.2432, + "num_tokens": 32565188.0, + "reward": 0.73077392578125, + "reward_std": 0.006905839778482914, + "rewards//mean": 0.73077392578125, + "rewards//std": 0.03625384345650673, + "step": 3768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7538, + "grad_norm": 7.631691932678223, + "kl": 2.284428743645549, + "learning_rate": 1.4521463173173965e-07, + "loss": 0.2284, + "num_tokens": 32573900.0, + "reward": 0.7774658203125, + "reward_std": 0.010101434774696827, + "rewards//mean": 0.7774658203125, + "rewards//std": 0.0429939329624176, + "step": 3769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.754, + "grad_norm": 4.028189182281494, + "kl": 2.0314571000635624, + "learning_rate": 1.4499109980075635e-07, + "loss": 0.2031, + "num_tokens": 32582548.0, + "reward": 0.799560546875, + "reward_std": 0.009979676455259323, + "rewards//mean": 0.799560546875, + "rewards//std": 0.023301472887396812, + "step": 3770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7542, + "grad_norm": 9.119534492492676, + "kl": 2.360508020967245, + "learning_rate": 1.4476771086731565e-07, + "loss": 0.2361, + "num_tokens": 32591172.0, + "reward": 0.76055908203125, + "reward_std": 0.006074434611946344, + "rewards//mean": 0.76055908203125, + "rewards//std": 0.018264736980199814, + "step": 3771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7544, + "grad_norm": 4.192183494567871, + "kl": 1.2719803005456924, + "learning_rate": 1.445444650213986e-07, + "loss": 0.1272, + "num_tokens": 32599708.0, + "reward": 0.7435302734375, + "reward_std": 0.006837446708232164, + "rewards//mean": 0.7435302734375, + "rewards//std": 0.027163511142134666, + "step": 3772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7546, + "grad_norm": 4.408973693847656, + "kl": 1.6436524875462055, + "learning_rate": 1.4432136235292846e-07, + "loss": 0.1644, + "num_tokens": 32608316.0, + "reward": 0.76788330078125, + "reward_std": 0.009042927995324135, + "rewards//mean": 0.76788330078125, + "rewards//std": 0.027645941823720932, + "step": 3773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7548, + "grad_norm": 9.720597267150879, + "kl": 2.3140203412622213, + "learning_rate": 1.44098402951771e-07, + "loss": 0.2314, + "num_tokens": 32616900.0, + "reward": 0.77398681640625, + "reward_std": 0.014772581867873669, + "rewards//mean": 0.77398681640625, + "rewards//std": 0.0422489270567894, + "step": 3774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.755, + "grad_norm": 1.682778239250183, + "kl": 1.5434564389288425, + "learning_rate": 1.4387558690773426e-07, + "loss": 0.1543, + "num_tokens": 32625516.0, + "reward": 0.75775146484375, + "reward_std": 0.010503279976546764, + "rewards//mean": 0.75775146484375, + "rewards//std": 0.029573488980531693, + "step": 3775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7552, + "grad_norm": 4.156941890716553, + "kl": 1.2965066730976105, + "learning_rate": 1.436529143105687e-07, + "loss": 0.1297, + "num_tokens": 32634140.0, + "reward": 0.7569580078125, + "reward_std": 0.006661273073405027, + "rewards//mean": 0.7569580078125, + "rewards//std": 0.030863720923662186, + "step": 3776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7554, + "grad_norm": 3.438358783721924, + "kl": 1.362759431824088, + "learning_rate": 1.434303852499664e-07, + "loss": 0.1363, + "num_tokens": 32642756.0, + "reward": 0.7237548828125, + "reward_std": 0.005725045222789049, + "rewards//mean": 0.7237548828125, + "rewards//std": 0.0367920845746994, + "step": 3777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7556, + "grad_norm": 2.3134765625, + "kl": 1.7298668827861547, + "learning_rate": 1.432079998155624e-07, + "loss": 0.173, + "num_tokens": 32651380.0, + "reward": 0.730712890625, + "reward_std": 0.0119344312697649, + "rewards//mean": 0.730712890625, + "rewards//std": 0.030168499797582626, + "step": 3778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7558, + "grad_norm": 6.425990581512451, + "kl": 1.8803725372999907, + "learning_rate": 1.4298575809693353e-07, + "loss": 0.188, + "num_tokens": 32659964.0, + "reward": 0.7430419921875, + "reward_std": 0.012739241123199463, + "rewards//mean": 0.7430419921875, + "rewards//std": 0.02731688879430294, + "step": 3779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.756, + "grad_norm": 3.652817726135254, + "kl": 2.3078398667275906, + "learning_rate": 1.4276366018359842e-07, + "loss": 0.2308, + "num_tokens": 32668628.0, + "reward": 0.76513671875, + "reward_std": 0.016831163316965103, + "rewards//mean": 0.76513671875, + "rewards//std": 0.034893475472927094, + "step": 3780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7562, + "grad_norm": 3.939563512802124, + "kl": 1.1382131278514862, + "learning_rate": 1.4254170616501827e-07, + "loss": 0.1138, + "num_tokens": 32677372.0, + "reward": 0.76275634765625, + "reward_std": 0.011067896150052547, + "rewards//mean": 0.76275634765625, + "rewards//std": 0.02759716659784317, + "step": 3781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7564, + "grad_norm": 4.1391921043396, + "kl": 2.0712663140147924, + "learning_rate": 1.4231989613059614e-07, + "loss": 0.2071, + "num_tokens": 32686012.0, + "reward": 0.76605224609375, + "reward_std": 0.01823597401380539, + "rewards//mean": 0.76605224609375, + "rewards//std": 0.03867063671350479, + "step": 3782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7566, + "grad_norm": 3.765363931655884, + "kl": 1.2633594367653131, + "learning_rate": 1.420982301696772e-07, + "loss": 0.1263, + "num_tokens": 32694724.0, + "reward": 0.76495361328125, + "reward_std": 0.007359200157225132, + "rewards//mean": 0.76495361328125, + "rewards//std": 0.020734917372465134, + "step": 3783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7568, + "grad_norm": 4.97023344039917, + "kl": 1.8460232391953468, + "learning_rate": 1.4187670837154824e-07, + "loss": 0.1846, + "num_tokens": 32703364.0, + "reward": 0.8052978515625, + "reward_std": 0.011903153732419014, + "rewards//mean": 0.8052978515625, + "rewards//std": 0.035904183983802795, + "step": 3784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.757, + "grad_norm": 5.849531173706055, + "kl": 1.2989363986998796, + "learning_rate": 1.4165533082543828e-07, + "loss": 0.1299, + "num_tokens": 32711972.0, + "reward": 0.75152587890625, + "reward_std": 0.006052725948393345, + "rewards//mean": 0.75152587890625, + "rewards//std": 0.02721160277724266, + "step": 3785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7572, + "grad_norm": 4.836897850036621, + "kl": 1.033550651744008, + "learning_rate": 1.414340976205183e-07, + "loss": 0.1034, + "num_tokens": 32720612.0, + "reward": 0.7501220703125, + "reward_std": 0.0044970386661589146, + "rewards//mean": 0.7501220703125, + "rewards//std": 0.028923245146870613, + "step": 3786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7574, + "grad_norm": 2.363436698913574, + "kl": 1.0539420768618584, + "learning_rate": 1.4121300884590098e-07, + "loss": 0.1054, + "num_tokens": 32729268.0, + "reward": 0.76287841796875, + "reward_std": 0.003251514630392194, + "rewards//mean": 0.76287841796875, + "rewards//std": 0.016332561150193214, + "step": 3787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7576, + "grad_norm": 1.0397495031356812, + "kl": 0.8341781925410032, + "learning_rate": 1.4099206459064062e-07, + "loss": 0.0834, + "num_tokens": 32737868.0, + "reward": 0.7430419921875, + "reward_std": 0.004140119068324566, + "rewards//mean": 0.7430419921875, + "rewards//std": 0.028043145313858986, + "step": 3788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7578, + "grad_norm": 17.706974029541016, + "kl": 3.852318063378334, + "learning_rate": 1.4077126494373376e-07, + "loss": 0.3852, + "num_tokens": 32746628.0, + "reward": 0.7508544921875, + "reward_std": 0.013303064741194248, + "rewards//mean": 0.7508544921875, + "rewards//std": 0.05100952833890915, + "step": 3789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.758, + "grad_norm": 22.017160415649414, + "kl": 4.1642041858285666, + "learning_rate": 1.4055060999411838e-07, + "loss": 0.4164, + "num_tokens": 32755348.0, + "reward": 0.737548828125, + "reward_std": 0.015501863323152065, + "rewards//mean": 0.737548828125, + "rewards//std": 0.042500466108322144, + "step": 3790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7582, + "grad_norm": 17.612506866455078, + "kl": 2.6022265516221523, + "learning_rate": 1.4033009983067452e-07, + "loss": 0.2602, + "num_tokens": 32763964.0, + "reward": 0.74365234375, + "reward_std": 0.011014536023139954, + "rewards//mean": 0.74365234375, + "rewards//std": 0.038117922842502594, + "step": 3791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7584, + "grad_norm": 10.17664909362793, + "kl": 2.4308334048837423, + "learning_rate": 1.4010973454222323e-07, + "loss": 0.2431, + "num_tokens": 32772548.0, + "reward": 0.7015380859375, + "reward_std": 0.010064586997032166, + "rewards//mean": 0.7015380859375, + "rewards//std": 0.047366999089717865, + "step": 3792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7586, + "grad_norm": 7.027637004852295, + "kl": 1.9163692835718393, + "learning_rate": 1.3988951421752788e-07, + "loss": 0.1916, + "num_tokens": 32781236.0, + "reward": 0.7694091796875, + "reward_std": 0.012211175635457039, + "rewards//mean": 0.7694091796875, + "rewards//std": 0.03141209855675697, + "step": 3793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7588, + "grad_norm": 9.107929229736328, + "kl": 2.0741945635527372, + "learning_rate": 1.396694389452931e-07, + "loss": 0.2074, + "num_tokens": 32789924.0, + "reward": 0.7489013671875, + "reward_std": 0.009641960263252258, + "rewards//mean": 0.7489013671875, + "rewards//std": 0.03104368969798088, + "step": 3794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.759, + "grad_norm": 13.850433349609375, + "kl": 0.9784563649445772, + "learning_rate": 1.394495088141654e-07, + "loss": 0.0978, + "num_tokens": 32798540.0, + "reward": 0.748291015625, + "reward_std": 0.00568180438131094, + "rewards//mean": 0.748291015625, + "rewards//std": 0.027204997837543488, + "step": 3795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7592, + "grad_norm": 16.362194061279297, + "kl": 2.9672423228621483, + "learning_rate": 1.3922972391273225e-07, + "loss": 0.2967, + "num_tokens": 32807468.0, + "reward": 0.7440185546875, + "reward_std": 0.010563543066382408, + "rewards//mean": 0.7440185546875, + "rewards//std": 0.04043933376669884, + "step": 3796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7594, + "grad_norm": 10.073771476745605, + "kl": 2.6958575937896967, + "learning_rate": 1.3901008432952322e-07, + "loss": 0.2696, + "num_tokens": 32816100.0, + "reward": 0.7969970703125, + "reward_std": 0.0181894451379776, + "rewards//mean": 0.7969970703125, + "rewards//std": 0.0313735231757164, + "step": 3797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7596, + "grad_norm": 4.0371479988098145, + "kl": 1.4425696786493063, + "learning_rate": 1.3879059015300915e-07, + "loss": 0.1443, + "num_tokens": 32824724.0, + "reward": 0.76019287109375, + "reward_std": 0.013683244585990906, + "rewards//mean": 0.76019287109375, + "rewards//std": 0.04208090901374817, + "step": 3798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7598, + "grad_norm": 4.628749370574951, + "kl": 1.8947123996913433, + "learning_rate": 1.3857124147160204e-07, + "loss": 0.1895, + "num_tokens": 32833380.0, + "reward": 0.7220458984375, + "reward_std": 0.009501872584223747, + "rewards//mean": 0.7220458984375, + "rewards//std": 0.044178929179906845, + "step": 3799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.76, + "grad_norm": 5.843815803527832, + "kl": 2.464186219498515, + "learning_rate": 1.3835203837365561e-07, + "loss": 0.2464, + "num_tokens": 32842100.0, + "reward": 0.73590087890625, + "reward_std": 0.014534153044223785, + "rewards//mean": 0.73590087890625, + "rewards//std": 0.04318307340145111, + "step": 3800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7602, + "grad_norm": 9.040116310119629, + "kl": 2.997537974268198, + "learning_rate": 1.381329809474649e-07, + "loss": 0.2998, + "num_tokens": 32850748.0, + "reward": 0.742919921875, + "reward_std": 0.014470092952251434, + "rewards//mean": 0.742919921875, + "rewards//std": 0.03791644051671028, + "step": 3801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7604, + "grad_norm": 20.051992416381836, + "kl": 1.9462823774665594, + "learning_rate": 1.3791406928126635e-07, + "loss": 0.1946, + "num_tokens": 32859388.0, + "reward": 0.7760009765625, + "reward_std": 0.009369083680212498, + "rewards//mean": 0.7760009765625, + "rewards//std": 0.035374097526073456, + "step": 3802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7606, + "grad_norm": 5.591738224029541, + "kl": 1.097462935373187, + "learning_rate": 1.3769530346323721e-07, + "loss": 0.1097, + "num_tokens": 32867940.0, + "reward": 0.7882080078125, + "reward_std": 0.00509650120511651, + "rewards//mean": 0.7882080078125, + "rewards//std": 0.021959304809570312, + "step": 3803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7608, + "grad_norm": 3.1748239994049072, + "kl": 1.4979219771921635, + "learning_rate": 1.3747668358149656e-07, + "loss": 0.1498, + "num_tokens": 32876564.0, + "reward": 0.75653076171875, + "reward_std": 0.006211716216057539, + "rewards//mean": 0.75653076171875, + "rewards//std": 0.030679430812597275, + "step": 3804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.761, + "grad_norm": 4.102215766906738, + "kl": 1.3789365161210299, + "learning_rate": 1.3725820972410434e-07, + "loss": 0.1379, + "num_tokens": 32885196.0, + "reward": 0.7711181640625, + "reward_std": 0.012195631861686707, + "rewards//mean": 0.7711181640625, + "rewards//std": 0.02945224940776825, + "step": 3805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7612, + "grad_norm": 3.6879026889801025, + "kl": 1.717773512005806, + "learning_rate": 1.3703988197906207e-07, + "loss": 0.1718, + "num_tokens": 32893716.0, + "reward": 0.72723388671875, + "reward_std": 0.008627040311694145, + "rewards//mean": 0.72723388671875, + "rewards//std": 0.02692471072077751, + "step": 3806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7614, + "grad_norm": 2.225463628768921, + "kl": 1.4426934830844402, + "learning_rate": 1.3682170043431173e-07, + "loss": 0.1443, + "num_tokens": 32902396.0, + "reward": 0.72772216796875, + "reward_std": 0.006584385875612497, + "rewards//mean": 0.72772216796875, + "rewards//std": 0.031716812402009964, + "step": 3807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7616, + "grad_norm": 1.3565641641616821, + "kl": 1.2816112246364355, + "learning_rate": 1.3660366517773708e-07, + "loss": 0.1282, + "num_tokens": 32911156.0, + "reward": 0.77008056640625, + "reward_std": 0.006273115053772926, + "rewards//mean": 0.77008056640625, + "rewards//std": 0.02499638870358467, + "step": 3808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7618, + "grad_norm": 5.5172271728515625, + "kl": 1.2453759163618088, + "learning_rate": 1.3638577629716263e-07, + "loss": 0.1245, + "num_tokens": 32919780.0, + "reward": 0.77978515625, + "reward_std": 0.005890419241040945, + "rewards//mean": 0.77978515625, + "rewards//std": 0.023872656747698784, + "step": 3809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.762, + "grad_norm": 8.676848411560059, + "kl": 2.180679567158222, + "learning_rate": 1.3616803388035413e-07, + "loss": 0.2181, + "num_tokens": 32928364.0, + "reward": 0.76141357421875, + "reward_std": 0.01069190725684166, + "rewards//mean": 0.76141357421875, + "rewards//std": 0.03419092670083046, + "step": 3810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7622, + "grad_norm": 4.261361598968506, + "kl": 0.578357856720686, + "learning_rate": 1.3595043801501794e-07, + "loss": 0.0578, + "num_tokens": 32936892.0, + "reward": 0.74615478515625, + "reward_std": 0.0014694316778331995, + "rewards//mean": 0.74615478515625, + "rewards//std": 0.028700673952698708, + "step": 3811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7624, + "grad_norm": 14.22526741027832, + "kl": 2.7361455522477627, + "learning_rate": 1.3573298878880179e-07, + "loss": 0.2736, + "num_tokens": 32945700.0, + "reward": 0.7532958984375, + "reward_std": 0.009671827778220177, + "rewards//mean": 0.7532958984375, + "rewards//std": 0.02432212419807911, + "step": 3812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7626, + "grad_norm": 14.056611061096191, + "kl": 3.350207319483161, + "learning_rate": 1.3551568628929432e-07, + "loss": 0.335, + "num_tokens": 32954380.0, + "reward": 0.75177001953125, + "reward_std": 0.00874075386673212, + "rewards//mean": 0.75177001953125, + "rewards//std": 0.027249960228800774, + "step": 3813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7628, + "grad_norm": 2.423682928085327, + "kl": 1.684755228459835, + "learning_rate": 1.352985306040247e-07, + "loss": 0.1685, + "num_tokens": 32963012.0, + "reward": 0.7427978515625, + "reward_std": 0.00968673825263977, + "rewards//mean": 0.7427978515625, + "rewards//std": 0.034752413630485535, + "step": 3814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.763, + "grad_norm": 2.457785129547119, + "kl": 1.5017896834760904, + "learning_rate": 1.3508152182046335e-07, + "loss": 0.1502, + "num_tokens": 32971628.0, + "reward": 0.7567138671875, + "reward_std": 0.007898999378085136, + "rewards//mean": 0.7567138671875, + "rewards//std": 0.04127812013030052, + "step": 3815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7632, + "grad_norm": 4.576962947845459, + "kl": 1.3891922570765018, + "learning_rate": 1.3486466002602132e-07, + "loss": 0.1389, + "num_tokens": 32980220.0, + "reward": 0.771484375, + "reward_std": 0.012224636040627956, + "rewards//mean": 0.771484375, + "rewards//std": 0.02561972290277481, + "step": 3816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7634, + "grad_norm": 7.008418083190918, + "kl": 1.6953949760645628, + "learning_rate": 1.3464794530805073e-07, + "loss": 0.1695, + "num_tokens": 32988892.0, + "reward": 0.72796630859375, + "reward_std": 0.01646377705037594, + "rewards//mean": 0.72796630859375, + "rewards//std": 0.03896544873714447, + "step": 3817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7636, + "grad_norm": 4.152071952819824, + "kl": 0.947254903614521, + "learning_rate": 1.3443137775384396e-07, + "loss": 0.0947, + "num_tokens": 32997516.0, + "reward": 0.78173828125, + "reward_std": 0.003922185394912958, + "rewards//mean": 0.78173828125, + "rewards//std": 0.030269687995314598, + "step": 3818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7638, + "grad_norm": 6.054388046264648, + "kl": 1.9691888224333525, + "learning_rate": 1.342149574506345e-07, + "loss": 0.1969, + "num_tokens": 33006228.0, + "reward": 0.75732421875, + "reward_std": 0.010939767584204674, + "rewards//mean": 0.75732421875, + "rewards//std": 0.032049696892499924, + "step": 3819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.764, + "grad_norm": 3.9398393630981445, + "kl": 1.5680154897272587, + "learning_rate": 1.3399868448559636e-07, + "loss": 0.1568, + "num_tokens": 33014908.0, + "reward": 0.7923583984375, + "reward_std": 0.007671463303267956, + "rewards//mean": 0.7923583984375, + "rewards//std": 0.03245311602950096, + "step": 3820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7642, + "grad_norm": 7.310879230499268, + "kl": 1.0349591486155987, + "learning_rate": 1.3378255894584462e-07, + "loss": 0.1035, + "num_tokens": 33023540.0, + "reward": 0.75640869140625, + "reward_std": 0.008131724782288074, + "rewards//mean": 0.75640869140625, + "rewards//std": 0.030823661014437675, + "step": 3821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7644, + "grad_norm": 11.91458511352539, + "kl": 2.225643003359437, + "learning_rate": 1.335665809184341e-07, + "loss": 0.2226, + "num_tokens": 33032324.0, + "reward": 0.7493896484375, + "reward_std": 0.006475863978266716, + "rewards//mean": 0.7493896484375, + "rewards//std": 0.023408742621541023, + "step": 3822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7646, + "grad_norm": 2.509631633758545, + "kl": 1.8514486886560917, + "learning_rate": 1.3335075049036099e-07, + "loss": 0.1851, + "num_tokens": 33040860.0, + "reward": 0.7437744140625, + "reward_std": 0.011470258235931396, + "rewards//mean": 0.7437744140625, + "rewards//std": 0.03451814875006676, + "step": 3823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7648, + "grad_norm": 3.968416213989258, + "kl": 1.094913860782981, + "learning_rate": 1.3313506774856175e-07, + "loss": 0.1095, + "num_tokens": 33049580.0, + "reward": 0.7340087890625, + "reward_std": 0.006548613775521517, + "rewards//mean": 0.7340087890625, + "rewards//std": 0.03192262724041939, + "step": 3824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.765, + "grad_norm": 2.2487804889678955, + "kl": 1.4780880883336067, + "learning_rate": 1.3291953277991347e-07, + "loss": 0.1478, + "num_tokens": 33058228.0, + "reward": 0.72100830078125, + "reward_std": 0.010310685262084007, + "rewards//mean": 0.72100830078125, + "rewards//std": 0.031892914324998856, + "step": 3825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7652, + "grad_norm": 6.08770751953125, + "kl": 2.2755823619663715, + "learning_rate": 1.327041456712334e-07, + "loss": 0.2276, + "num_tokens": 33066932.0, + "reward": 0.74835205078125, + "reward_std": 0.015277864411473274, + "rewards//mean": 0.74835205078125, + "rewards//std": 0.04231158271431923, + "step": 3826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7654, + "grad_norm": 7.314813137054443, + "kl": 1.6093310620635748, + "learning_rate": 1.3248890650927945e-07, + "loss": 0.1609, + "num_tokens": 33075612.0, + "reward": 0.7476806640625, + "reward_std": 0.006037743762135506, + "rewards//mean": 0.7476806640625, + "rewards//std": 0.03438457474112511, + "step": 3827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7656, + "grad_norm": 3.5261342525482178, + "kl": 0.6048196014016867, + "learning_rate": 1.3227381538075023e-07, + "loss": 0.0605, + "num_tokens": 33084268.0, + "reward": 0.76715087890625, + "reward_std": 0.003731552977114916, + "rewards//mean": 0.76715087890625, + "rewards//std": 0.02571048028767109, + "step": 3828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7658, + "grad_norm": 5.768620014190674, + "kl": 1.392198096960783, + "learning_rate": 1.3205887237228397e-07, + "loss": 0.1392, + "num_tokens": 33092868.0, + "reward": 0.77032470703125, + "reward_std": 0.012026659213006496, + "rewards//mean": 0.77032470703125, + "rewards//std": 0.026228679344058037, + "step": 3829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.766, + "grad_norm": 5.63932466506958, + "kl": 1.3853070233017206, + "learning_rate": 1.3184407757045995e-07, + "loss": 0.1385, + "num_tokens": 33101476.0, + "reward": 0.76556396484375, + "reward_std": 0.014114229939877987, + "rewards//mean": 0.76556396484375, + "rewards//std": 0.030445151031017303, + "step": 3830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7662, + "grad_norm": 3.136876106262207, + "kl": 2.158704224973917, + "learning_rate": 1.3162943106179748e-07, + "loss": 0.2159, + "num_tokens": 33110116.0, + "reward": 0.745361328125, + "reward_std": 0.01068568229675293, + "rewards//mean": 0.745361328125, + "rewards//std": 0.035969264805316925, + "step": 3831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7664, + "grad_norm": 6.753223896026611, + "kl": 0.5993874557316303, + "learning_rate": 1.314149329327563e-07, + "loss": 0.0599, + "num_tokens": 33118676.0, + "reward": 0.7615966796875, + "reward_std": 0.0023404296953231096, + "rewards//mean": 0.7615966796875, + "rewards//std": 0.02150796540081501, + "step": 3832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7666, + "grad_norm": 9.286118507385254, + "kl": 1.8374832309782505, + "learning_rate": 1.3120058326973582e-07, + "loss": 0.1837, + "num_tokens": 33127380.0, + "reward": 0.7705078125, + "reward_std": 0.015113026835024357, + "rewards//mean": 0.7705078125, + "rewards//std": 0.03782770782709122, + "step": 3833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7668, + "grad_norm": 8.53174877166748, + "kl": 1.7777348682284355, + "learning_rate": 1.3098638215907638e-07, + "loss": 0.1778, + "num_tokens": 33136060.0, + "reward": 0.7574462890625, + "reward_std": 0.009057469666004181, + "rewards//mean": 0.7574462890625, + "rewards//std": 0.026736624538898468, + "step": 3834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.767, + "grad_norm": 5.330288410186768, + "kl": 1.097549520432949, + "learning_rate": 1.3077232968705805e-07, + "loss": 0.1098, + "num_tokens": 33144612.0, + "reward": 0.7327880859375, + "reward_std": 0.007091484498232603, + "rewards//mean": 0.7327880859375, + "rewards//std": 0.03541857376694679, + "step": 3835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7672, + "grad_norm": 6.398611068725586, + "kl": 1.8445174898952246, + "learning_rate": 1.305584259399013e-07, + "loss": 0.1845, + "num_tokens": 33153364.0, + "reward": 0.77398681640625, + "reward_std": 0.008683791384100914, + "rewards//mean": 0.77398681640625, + "rewards//std": 0.03283631429076195, + "step": 3836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7674, + "grad_norm": 6.837332725524902, + "kl": 1.620683589950204, + "learning_rate": 1.3034467100376622e-07, + "loss": 0.1621, + "num_tokens": 33161988.0, + "reward": 0.74810791015625, + "reward_std": 0.008285700343549252, + "rewards//mean": 0.74810791015625, + "rewards//std": 0.022325994446873665, + "step": 3837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 127.34375, + "epoch": 0.7676, + "grad_norm": 8.560009002685547, + "kl": 2.0264624636620283, + "learning_rate": 1.3013106496475352e-07, + "loss": 0.1957, + "num_tokens": 33170626.0, + "reward": 0.80230712890625, + "reward_std": 0.0066723814234137535, + "rewards//mean": 0.80230712890625, + "rewards//std": 0.015788720920681953, + "step": 3838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7678, + "grad_norm": 3.4256339073181152, + "kl": 1.4840133432298899, + "learning_rate": 1.299176079089036e-07, + "loss": 0.1484, + "num_tokens": 33179210.0, + "reward": 0.7698974609375, + "reward_std": 0.012859897688031197, + "rewards//mean": 0.7698974609375, + "rewards//std": 0.025139881297945976, + "step": 3839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.768, + "grad_norm": 12.228135108947754, + "kl": 2.3034783229231834, + "learning_rate": 1.2970429992219712e-07, + "loss": 0.2303, + "num_tokens": 33187922.0, + "reward": 0.74859619140625, + "reward_std": 0.011356012895703316, + "rewards//mean": 0.74859619140625, + "rewards//std": 0.03204869106411934, + "step": 3840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7682, + "grad_norm": 13.251349449157715, + "kl": 1.6303869597613811, + "learning_rate": 1.2949114109055414e-07, + "loss": 0.163, + "num_tokens": 33196506.0, + "reward": 0.728271484375, + "reward_std": 0.008104080334305763, + "rewards//mean": 0.728271484375, + "rewards//std": 0.03854992985725403, + "step": 3841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7684, + "grad_norm": 3.585341691970825, + "kl": 1.2887060474604368, + "learning_rate": 1.2927813149983525e-07, + "loss": 0.1289, + "num_tokens": 33205154.0, + "reward": 0.75445556640625, + "reward_std": 0.009880157187581062, + "rewards//mean": 0.75445556640625, + "rewards//std": 0.04209960997104645, + "step": 3842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7686, + "grad_norm": 3.4269046783447266, + "kl": 1.6432447992265224, + "learning_rate": 1.2906527123584081e-07, + "loss": 0.1643, + "num_tokens": 33213882.0, + "reward": 0.729736328125, + "reward_std": 0.010720941238105297, + "rewards//mean": 0.729736328125, + "rewards//std": 0.03217320516705513, + "step": 3843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7688, + "grad_norm": 14.91458511352539, + "kl": 2.987995255738497, + "learning_rate": 1.2885256038431064e-07, + "loss": 0.2988, + "num_tokens": 33222522.0, + "reward": 0.7701416015625, + "reward_std": 0.014191554859280586, + "rewards//mean": 0.7701416015625, + "rewards//std": 0.03650625795125961, + "step": 3844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.769, + "grad_norm": 7.5672287940979, + "kl": 1.2307368107140064, + "learning_rate": 1.286399990309247e-07, + "loss": 0.1231, + "num_tokens": 33231266.0, + "reward": 0.71673583984375, + "reward_std": 0.009703254327178001, + "rewards//mean": 0.71673583984375, + "rewards//std": 0.03913678601384163, + "step": 3845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7692, + "grad_norm": 11.405426025390625, + "kl": 2.117430228739977, + "learning_rate": 1.284275872613028e-07, + "loss": 0.2117, + "num_tokens": 33239890.0, + "reward": 0.76922607421875, + "reward_std": 0.00962196197360754, + "rewards//mean": 0.76922607421875, + "rewards//std": 0.03450120612978935, + "step": 3846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7694, + "grad_norm": 9.609193801879883, + "kl": 1.5075704138725996, + "learning_rate": 1.2821532516100447e-07, + "loss": 0.1508, + "num_tokens": 33248642.0, + "reward": 0.7677001953125, + "reward_std": 0.00876168254762888, + "rewards//mean": 0.7677001953125, + "rewards//std": 0.03164256736636162, + "step": 3847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7696, + "grad_norm": 5.812592029571533, + "kl": 1.7598407082259655, + "learning_rate": 1.280032128155285e-07, + "loss": 0.176, + "num_tokens": 33257282.0, + "reward": 0.7479248046875, + "reward_std": 0.008533084765076637, + "rewards//mean": 0.7479248046875, + "rewards//std": 0.031535226851701736, + "step": 3848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7698, + "grad_norm": 4.1324992179870605, + "kl": 1.193206051364541, + "learning_rate": 1.2779125031031414e-07, + "loss": 0.1193, + "num_tokens": 33265922.0, + "reward": 0.76666259765625, + "reward_std": 0.007270403206348419, + "rewards//mean": 0.76666259765625, + "rewards//std": 0.02589469589293003, + "step": 3849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.77, + "grad_norm": 7.049158096313477, + "kl": 1.2254748307168484, + "learning_rate": 1.2757943773073943e-07, + "loss": 0.1225, + "num_tokens": 33274634.0, + "reward": 0.71630859375, + "reward_std": 0.006141458638012409, + "rewards//mean": 0.71630859375, + "rewards//std": 0.0297206062823534, + "step": 3850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7702, + "grad_norm": 5.917280197143555, + "kl": 2.986080203205347, + "learning_rate": 1.2736777516212267e-07, + "loss": 0.2986, + "num_tokens": 33283322.0, + "reward": 0.76043701171875, + "reward_std": 0.025417130440473557, + "rewards//mean": 0.76043701171875, + "rewards//std": 0.04005470499396324, + "step": 3851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7704, + "grad_norm": 1.188199520111084, + "kl": 1.4054109957069159, + "learning_rate": 1.2715626268972167e-07, + "loss": 0.1405, + "num_tokens": 33291906.0, + "reward": 0.7635498046875, + "reward_std": 0.008255021646618843, + "rewards//mean": 0.7635498046875, + "rewards//std": 0.02872999757528305, + "step": 3852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7706, + "grad_norm": 6.564970970153809, + "kl": 2.3109420109540224, + "learning_rate": 1.2694490039873333e-07, + "loss": 0.2311, + "num_tokens": 33300642.0, + "reward": 0.75140380859375, + "reward_std": 0.015848973765969276, + "rewards//mean": 0.75140380859375, + "rewards//std": 0.034141745418310165, + "step": 3853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7708, + "grad_norm": 10.526966094970703, + "kl": 0.9214737936854362, + "learning_rate": 1.267336883742945e-07, + "loss": 0.0921, + "num_tokens": 33309202.0, + "reward": 0.73699951171875, + "reward_std": 0.003394263330847025, + "rewards//mean": 0.73699951171875, + "rewards//std": 0.027375219389796257, + "step": 3854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.771, + "grad_norm": 5.909029006958008, + "kl": 1.1711606364697218, + "learning_rate": 1.2652262670148134e-07, + "loss": 0.1171, + "num_tokens": 33317842.0, + "reward": 0.74420166015625, + "reward_std": 0.011669214814901352, + "rewards//mean": 0.74420166015625, + "rewards//std": 0.033291399478912354, + "step": 3855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7712, + "grad_norm": 6.115334510803223, + "kl": 1.9689525179564953, + "learning_rate": 1.2631171546530966e-07, + "loss": 0.1969, + "num_tokens": 33326378.0, + "reward": 0.760009765625, + "reward_std": 0.022591624408960342, + "rewards//mean": 0.760009765625, + "rewards//std": 0.04139183089137077, + "step": 3856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7714, + "grad_norm": 4.440582275390625, + "kl": 1.9442737326025963, + "learning_rate": 1.2610095475073413e-07, + "loss": 0.1944, + "num_tokens": 33335090.0, + "reward": 0.7723388671875, + "reward_std": 0.013531842269003391, + "rewards//mean": 0.7723388671875, + "rewards//std": 0.031323302537202835, + "step": 3857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7716, + "grad_norm": 2.7844839096069336, + "kl": 2.0343784373253584, + "learning_rate": 1.258903446426493e-07, + "loss": 0.2034, + "num_tokens": 33343762.0, + "reward": 0.75921630859375, + "reward_std": 0.011886494234204292, + "rewards//mean": 0.75921630859375, + "rewards//std": 0.026119371876120567, + "step": 3858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7718, + "grad_norm": 10.772706031799316, + "kl": 2.618398081511259, + "learning_rate": 1.2567988522588908e-07, + "loss": 0.2618, + "num_tokens": 33352466.0, + "reward": 0.76409912109375, + "reward_std": 0.016156960278749466, + "rewards//mean": 0.76409912109375, + "rewards//std": 0.035994451493024826, + "step": 3859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.772, + "grad_norm": 3.3371238708496094, + "kl": 1.084117479622364, + "learning_rate": 1.2546957658522618e-07, + "loss": 0.1084, + "num_tokens": 33361066.0, + "reward": 0.74755859375, + "reward_std": 0.009804603643715382, + "rewards//mean": 0.74755859375, + "rewards//std": 0.03190578520298004, + "step": 3860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7722, + "grad_norm": 14.782986640930176, + "kl": 2.573685735464096, + "learning_rate": 1.2525941880537304e-07, + "loss": 0.2574, + "num_tokens": 33369746.0, + "reward": 0.74908447265625, + "reward_std": 0.008148198947310448, + "rewards//mean": 0.74908447265625, + "rewards//std": 0.0268092080950737, + "step": 3861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7724, + "grad_norm": 9.644927024841309, + "kl": 1.797487584874034, + "learning_rate": 1.250494119709812e-07, + "loss": 0.1797, + "num_tokens": 33378362.0, + "reward": 0.75030517578125, + "reward_std": 0.009189760312438011, + "rewards//mean": 0.75030517578125, + "rewards//std": 0.042945098131895065, + "step": 3862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7726, + "grad_norm": 18.490568161010742, + "kl": 2.521689286455512, + "learning_rate": 1.2483955616664148e-07, + "loss": 0.2522, + "num_tokens": 33387018.0, + "reward": 0.74322509765625, + "reward_std": 0.01077241636812687, + "rewards//mean": 0.74322509765625, + "rewards//std": 0.03703337162733078, + "step": 3863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7728, + "grad_norm": 8.542414665222168, + "kl": 2.120613921433687, + "learning_rate": 1.2462985147688359e-07, + "loss": 0.2121, + "num_tokens": 33395634.0, + "reward": 0.74395751953125, + "reward_std": 0.009068668819963932, + "rewards//mean": 0.74395751953125, + "rewards//std": 0.03863813355565071, + "step": 3864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.773, + "grad_norm": 4.563297748565674, + "kl": 1.4008376970887184, + "learning_rate": 1.244202979861766e-07, + "loss": 0.1401, + "num_tokens": 33404362.0, + "reward": 0.77642822265625, + "reward_std": 0.010009217076003551, + "rewards//mean": 0.77642822265625, + "rewards//std": 0.027924340218305588, + "step": 3865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7732, + "grad_norm": 8.094592094421387, + "kl": 1.086251249536872, + "learning_rate": 1.2421089577892868e-07, + "loss": 0.1086, + "num_tokens": 33413042.0, + "reward": 0.76470947265625, + "reward_std": 0.009057670831680298, + "rewards//mean": 0.76470947265625, + "rewards//std": 0.04076012596487999, + "step": 3866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7734, + "grad_norm": 7.479476451873779, + "kl": 2.6798534374684095, + "learning_rate": 1.240016449394871e-07, + "loss": 0.268, + "num_tokens": 33421738.0, + "reward": 0.7169189453125, + "reward_std": 0.009325915016233921, + "rewards//mean": 0.7169189453125, + "rewards//std": 0.040247220546007156, + "step": 3867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7736, + "grad_norm": 6.100898265838623, + "kl": 2.869092285633087, + "learning_rate": 1.2379254555213786e-07, + "loss": 0.2869, + "num_tokens": 33430466.0, + "reward": 0.75836181640625, + "reward_std": 0.015933100134134293, + "rewards//mean": 0.75836181640625, + "rewards//std": 0.026737967506051064, + "step": 3868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7738, + "grad_norm": 8.428766250610352, + "kl": 2.1594008412212133, + "learning_rate": 1.2358359770110632e-07, + "loss": 0.2159, + "num_tokens": 33439130.0, + "reward": 0.76727294921875, + "reward_std": 0.012170176953077316, + "rewards//mean": 0.76727294921875, + "rewards//std": 0.02929629571735859, + "step": 3869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.774, + "grad_norm": 3.4925875663757324, + "kl": 1.5172917488962412, + "learning_rate": 1.2337480147055658e-07, + "loss": 0.1517, + "num_tokens": 33447722.0, + "reward": 0.71728515625, + "reward_std": 0.008411338552832603, + "rewards//mean": 0.71728515625, + "rewards//std": 0.030848268419504166, + "step": 3870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7742, + "grad_norm": 3.011601209640503, + "kl": 1.017035698518157, + "learning_rate": 1.2316615694459186e-07, + "loss": 0.1017, + "num_tokens": 33456258.0, + "reward": 0.74786376953125, + "reward_std": 0.009441666305065155, + "rewards//mean": 0.74786376953125, + "rewards//std": 0.018726341426372528, + "step": 3871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7744, + "grad_norm": 4.106814861297607, + "kl": 1.1321406867355108, + "learning_rate": 1.2295766420725401e-07, + "loss": 0.1132, + "num_tokens": 33464874.0, + "reward": 0.76568603515625, + "reward_std": 0.009720061905682087, + "rewards//mean": 0.76568603515625, + "rewards//std": 0.031308017671108246, + "step": 3872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7746, + "grad_norm": 1.5781792402267456, + "kl": 1.2482231557369232, + "learning_rate": 1.2274932334252386e-07, + "loss": 0.1248, + "num_tokens": 33473498.0, + "reward": 0.74737548828125, + "reward_std": 0.006058079656213522, + "rewards//mean": 0.74737548828125, + "rewards//std": 0.032429493963718414, + "step": 3873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7748, + "grad_norm": 2.612637519836426, + "kl": 1.9989351071417332, + "learning_rate": 1.225411344343213e-07, + "loss": 0.1999, + "num_tokens": 33482130.0, + "reward": 0.770751953125, + "reward_std": 0.0142394183203578, + "rewards//mean": 0.770751953125, + "rewards//std": 0.03534434363245964, + "step": 3874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.775, + "grad_norm": 8.00322437286377, + "kl": 1.8746436461806297, + "learning_rate": 1.2233309756650455e-07, + "loss": 0.1875, + "num_tokens": 33490690.0, + "reward": 0.78631591796875, + "reward_std": 0.01489313319325447, + "rewards//mean": 0.78631591796875, + "rewards//std": 0.03283493220806122, + "step": 3875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7752, + "grad_norm": 3.932457685470581, + "kl": 0.6452302783727646, + "learning_rate": 1.2212521282287093e-07, + "loss": 0.0645, + "num_tokens": 33499330.0, + "reward": 0.76300048828125, + "reward_std": 0.002432936569675803, + "rewards//mean": 0.76300048828125, + "rewards//std": 0.030121736228466034, + "step": 3876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7754, + "grad_norm": 8.418670654296875, + "kl": 1.7446348555386066, + "learning_rate": 1.219174802871563e-07, + "loss": 0.1745, + "num_tokens": 33507986.0, + "reward": 0.76287841796875, + "reward_std": 0.009140979498624802, + "rewards//mean": 0.76287841796875, + "rewards//std": 0.03231445699930191, + "step": 3877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7756, + "grad_norm": 5.504981994628906, + "kl": 1.2881504762917757, + "learning_rate": 1.2170990004303566e-07, + "loss": 0.1288, + "num_tokens": 33516562.0, + "reward": 0.74139404296875, + "reward_std": 0.006732706446200609, + "rewards//mean": 0.74139404296875, + "rewards//std": 0.027517516165971756, + "step": 3878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7758, + "grad_norm": 4.007456302642822, + "kl": 1.4617500063031912, + "learning_rate": 1.2150247217412185e-07, + "loss": 0.1462, + "num_tokens": 33525202.0, + "reward": 0.77490234375, + "reward_std": 0.008913259953260422, + "rewards//mean": 0.77490234375, + "rewards//std": 0.017641736194491386, + "step": 3879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.776, + "grad_norm": 4.955577373504639, + "kl": 1.3801674358546734, + "learning_rate": 1.21295196763967e-07, + "loss": 0.138, + "num_tokens": 33533882.0, + "reward": 0.7498779296875, + "reward_std": 0.012024147436022758, + "rewards//mean": 0.7498779296875, + "rewards//std": 0.03324754908680916, + "step": 3880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7762, + "grad_norm": 2.8849432468414307, + "kl": 1.0871131960302591, + "learning_rate": 1.2108807389606158e-07, + "loss": 0.1087, + "num_tokens": 33542466.0, + "reward": 0.77020263671875, + "reward_std": 0.004079015925526619, + "rewards//mean": 0.77020263671875, + "rewards//std": 0.027783581987023354, + "step": 3881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7764, + "grad_norm": 4.26233434677124, + "kl": 1.0907459892332554, + "learning_rate": 1.2088110365383486e-07, + "loss": 0.1091, + "num_tokens": 33551058.0, + "reward": 0.7327880859375, + "reward_std": 0.008394161239266396, + "rewards//mean": 0.7327880859375, + "rewards//std": 0.038942765444517136, + "step": 3882 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7766, + "grad_norm": 4.379446029663086, + "kl": 1.3941702488809824, + "learning_rate": 1.2067428612065406e-07, + "loss": 0.1394, + "num_tokens": 33559810.0, + "reward": 0.76007080078125, + "reward_std": 0.010440127924084663, + "rewards//mean": 0.76007080078125, + "rewards//std": 0.03343522921204567, + "step": 3883 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7768, + "grad_norm": 11.693062782287598, + "kl": 1.5943601075559855, + "learning_rate": 1.2046762137982547e-07, + "loss": 0.1594, + "num_tokens": 33568466.0, + "reward": 0.728759765625, + "reward_std": 0.003640070790424943, + "rewards//mean": 0.728759765625, + "rewards//std": 0.03413825482130051, + "step": 3884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.777, + "grad_norm": 3.1318116188049316, + "kl": 1.7485382184386253, + "learning_rate": 1.202611095145936e-07, + "loss": 0.1749, + "num_tokens": 33577122.0, + "reward": 0.74041748046875, + "reward_std": 0.009169296361505985, + "rewards//mean": 0.74041748046875, + "rewards//std": 0.03209824860095978, + "step": 3885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7772, + "grad_norm": 0.4359985291957855, + "kl": 0.43348943442106247, + "learning_rate": 1.2005475060814156e-07, + "loss": 0.0433, + "num_tokens": 33585786.0, + "reward": 0.72247314453125, + "reward_std": 0.00039866380393505096, + "rewards//mean": 0.72247314453125, + "rewards//std": 0.03195834904909134, + "step": 3886 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7774, + "grad_norm": 2.6584253311157227, + "kl": 1.156630089506507, + "learning_rate": 1.1984854474359042e-07, + "loss": 0.1157, + "num_tokens": 33594426.0, + "reward": 0.71746826171875, + "reward_std": 0.011043723672628403, + "rewards//mean": 0.71746826171875, + "rewards//std": 0.031048260629177094, + "step": 3887 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7776, + "grad_norm": 9.109469413757324, + "kl": 2.788218066096306, + "learning_rate": 1.1964249200399995e-07, + "loss": 0.2788, + "num_tokens": 33603090.0, + "reward": 0.78009033203125, + "reward_std": 0.018154215067625046, + "rewards//mean": 0.78009033203125, + "rewards//std": 0.0390419103205204, + "step": 3888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7778, + "grad_norm": 4.311148166656494, + "kl": 0.9812530316412449, + "learning_rate": 1.1943659247236837e-07, + "loss": 0.0981, + "num_tokens": 33611730.0, + "reward": 0.7464599609375, + "reward_std": 0.004944856744259596, + "rewards//mean": 0.7464599609375, + "rewards//std": 0.03913509473204613, + "step": 3889 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.778, + "grad_norm": 3.466789484024048, + "kl": 1.9767153728753328, + "learning_rate": 1.192308462316317e-07, + "loss": 0.1977, + "num_tokens": 33620370.0, + "reward": 0.71917724609375, + "reward_std": 0.010815571062266827, + "rewards//mean": 0.71917724609375, + "rewards//std": 0.026366254314780235, + "step": 3890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7782, + "grad_norm": 2.2381927967071533, + "kl": 1.0994114875793457, + "learning_rate": 1.1902525336466462e-07, + "loss": 0.1099, + "num_tokens": 33628994.0, + "reward": 0.82061767578125, + "reward_std": 0.012059198692440987, + "rewards//mean": 0.82061767578125, + "rewards//std": 0.025019995868206024, + "step": 3891 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7784, + "grad_norm": 3.7056803703308105, + "kl": 1.3651414718478918, + "learning_rate": 1.1881981395427993e-07, + "loss": 0.1365, + "num_tokens": 33637618.0, + "reward": 0.77862548828125, + "reward_std": 0.015710029751062393, + "rewards//mean": 0.77862548828125, + "rewards//std": 0.02837444841861725, + "step": 3892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7786, + "grad_norm": 5.062930583953857, + "kl": 1.620561370626092, + "learning_rate": 1.1861452808322874e-07, + "loss": 0.1621, + "num_tokens": 33646322.0, + "reward": 0.72125244140625, + "reward_std": 0.009189036674797535, + "rewards//mean": 0.72125244140625, + "rewards//std": 0.04130098596215248, + "step": 3893 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7788, + "grad_norm": 2.8955702781677246, + "kl": 1.3703166246414185, + "learning_rate": 1.1840939583419984e-07, + "loss": 0.137, + "num_tokens": 33655018.0, + "reward": 0.7529296875, + "reward_std": 0.009729741141200066, + "rewards//mean": 0.7529296875, + "rewards//std": 0.024920042604207993, + "step": 3894 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.779, + "grad_norm": 2.376582622528076, + "kl": 1.8151601385325193, + "learning_rate": 1.1820441728982072e-07, + "loss": 0.1815, + "num_tokens": 33663618.0, + "reward": 0.74322509765625, + "reward_std": 0.010157903656363487, + "rewards//mean": 0.74322509765625, + "rewards//std": 0.03327866643667221, + "step": 3895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7792, + "grad_norm": 4.464691638946533, + "kl": 1.1559271439909935, + "learning_rate": 1.1799959253265668e-07, + "loss": 0.1156, + "num_tokens": 33672242.0, + "reward": 0.76544189453125, + "reward_std": 0.0035160277038812637, + "rewards//mean": 0.76544189453125, + "rewards//std": 0.02802553027868271, + "step": 3896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7794, + "grad_norm": 4.394137382507324, + "kl": 0.9982554074376822, + "learning_rate": 1.1779492164521116e-07, + "loss": 0.0998, + "num_tokens": 33680818.0, + "reward": 0.7318115234375, + "reward_std": 0.007153951562941074, + "rewards//mean": 0.7318115234375, + "rewards//std": 0.03395574167370796, + "step": 3897 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7796, + "grad_norm": 1.8784767389297485, + "kl": 1.255854481831193, + "learning_rate": 1.1759040470992537e-07, + "loss": 0.1256, + "num_tokens": 33689506.0, + "reward": 0.75634765625, + "reward_std": 0.007995720952749252, + "rewards//mean": 0.75634765625, + "rewards//std": 0.02862468920648098, + "step": 3898 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7798, + "grad_norm": 3.370797872543335, + "kl": 1.6317807789891958, + "learning_rate": 1.1738604180917888e-07, + "loss": 0.1632, + "num_tokens": 33698194.0, + "reward": 0.72930908203125, + "reward_std": 0.015105238184332848, + "rewards//mean": 0.72930908203125, + "rewards//std": 0.03501252084970474, + "step": 3899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.78, + "grad_norm": 3.2768189907073975, + "kl": 0.887617751955986, + "learning_rate": 1.1718183302528895e-07, + "loss": 0.0888, + "num_tokens": 33706770.0, + "reward": 0.73199462890625, + "reward_std": 0.00544398557394743, + "rewards//mean": 0.73199462890625, + "rewards//std": 0.01615269109606743, + "step": 3900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7802, + "grad_norm": 3.469325065612793, + "kl": 1.2113143410533667, + "learning_rate": 1.1697777844051104e-07, + "loss": 0.1211, + "num_tokens": 33715346.0, + "reward": 0.777099609375, + "reward_std": 0.011110441759228706, + "rewards//mean": 0.777099609375, + "rewards//std": 0.028175892308354378, + "step": 3901 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7804, + "grad_norm": 4.458643913269043, + "kl": 0.866313936188817, + "learning_rate": 1.1677387813703804e-07, + "loss": 0.0866, + "num_tokens": 33724018.0, + "reward": 0.7210693359375, + "reward_std": 0.003817690769210458, + "rewards//mean": 0.7210693359375, + "rewards//std": 0.029336893931031227, + "step": 3902 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7806, + "grad_norm": 7.73784875869751, + "kl": 2.0586373265832663, + "learning_rate": 1.1657013219700106e-07, + "loss": 0.2059, + "num_tokens": 33732706.0, + "reward": 0.73193359375, + "reward_std": 0.011666069738566875, + "rewards//mean": 0.73193359375, + "rewards//std": 0.04235561192035675, + "step": 3903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7808, + "grad_norm": 7.406689643859863, + "kl": 1.872145613655448, + "learning_rate": 1.1636654070246904e-07, + "loss": 0.1872, + "num_tokens": 33741338.0, + "reward": 0.713134765625, + "reward_std": 0.013348628766834736, + "rewards//mean": 0.713134765625, + "rewards//std": 0.04699820280075073, + "step": 3904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.781, + "grad_norm": 4.445286273956299, + "kl": 0.6884522791951895, + "learning_rate": 1.1616310373544863e-07, + "loss": 0.0688, + "num_tokens": 33749970.0, + "reward": 0.76763916015625, + "reward_std": 0.002589502139016986, + "rewards//mean": 0.76763916015625, + "rewards//std": 0.027752509340643883, + "step": 3905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7812, + "grad_norm": 1.7538880109786987, + "kl": 1.1617361009120941, + "learning_rate": 1.1595982137788402e-07, + "loss": 0.1162, + "num_tokens": 33758546.0, + "reward": 0.77044677734375, + "reward_std": 0.011654814705252647, + "rewards//mean": 0.77044677734375, + "rewards//std": 0.030597908422350883, + "step": 3906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7814, + "grad_norm": 10.20732307434082, + "kl": 2.338462717831135, + "learning_rate": 1.1575669371165748e-07, + "loss": 0.2338, + "num_tokens": 33767218.0, + "reward": 0.76385498046875, + "reward_std": 0.012012193910777569, + "rewards//mean": 0.76385498046875, + "rewards//std": 0.03739580512046814, + "step": 3907 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7816, + "grad_norm": 3.7461202144622803, + "kl": 1.4943180810660124, + "learning_rate": 1.1555372081858883e-07, + "loss": 0.1494, + "num_tokens": 33775794.0, + "reward": 0.74871826171875, + "reward_std": 0.01357128843665123, + "rewards//mean": 0.74871826171875, + "rewards//std": 0.042054641991853714, + "step": 3908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7818, + "grad_norm": 3.682917356491089, + "kl": 1.4658153858035803, + "learning_rate": 1.1535090278043535e-07, + "loss": 0.1466, + "num_tokens": 33784490.0, + "reward": 0.75103759765625, + "reward_std": 0.0073267389088869095, + "rewards//mean": 0.75103759765625, + "rewards//std": 0.02775469236075878, + "step": 3909 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.782, + "grad_norm": 8.808707237243652, + "kl": 1.959876213222742, + "learning_rate": 1.151482396788922e-07, + "loss": 0.196, + "num_tokens": 33793106.0, + "reward": 0.73638916015625, + "reward_std": 0.010882798582315445, + "rewards//mean": 0.73638916015625, + "rewards//std": 0.03498007729649544, + "step": 3910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7822, + "grad_norm": 4.275513172149658, + "kl": 3.1317162178456783, + "learning_rate": 1.1494573159559212e-07, + "loss": 0.3132, + "num_tokens": 33801746.0, + "reward": 0.7437744140625, + "reward_std": 0.020146049559116364, + "rewards//mean": 0.7437744140625, + "rewards//std": 0.04158356785774231, + "step": 3911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7824, + "grad_norm": 4.286827087402344, + "kl": 0.8704972639679909, + "learning_rate": 1.1474337861210543e-07, + "loss": 0.087, + "num_tokens": 33810314.0, + "reward": 0.7440185546875, + "reward_std": 0.006640148349106312, + "rewards//mean": 0.7440185546875, + "rewards//std": 0.02553419955074787, + "step": 3912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7826, + "grad_norm": 12.594411849975586, + "kl": 1.8635885156691074, + "learning_rate": 1.1454118080993963e-07, + "loss": 0.1864, + "num_tokens": 33819010.0, + "reward": 0.7666015625, + "reward_std": 0.0051859281957149506, + "rewards//mean": 0.7666015625, + "rewards//std": 0.028923505917191505, + "step": 3913 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7828, + "grad_norm": 1.4292763471603394, + "kl": 1.3147436883300543, + "learning_rate": 1.1433913827054009e-07, + "loss": 0.1315, + "num_tokens": 33827674.0, + "reward": 0.762939453125, + "reward_std": 0.010203614830970764, + "rewards//mean": 0.762939453125, + "rewards//std": 0.0316188782453537, + "step": 3914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.783, + "grad_norm": 5.7144060134887695, + "kl": 2.7994453199207783, + "learning_rate": 1.1413725107528954e-07, + "loss": 0.2799, + "num_tokens": 33836434.0, + "reward": 0.751220703125, + "reward_std": 0.015385335311293602, + "rewards//mean": 0.751220703125, + "rewards//std": 0.03369550779461861, + "step": 3915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7832, + "grad_norm": 5.905735015869141, + "kl": 1.4288552645593882, + "learning_rate": 1.1393551930550826e-07, + "loss": 0.1429, + "num_tokens": 33845066.0, + "reward": 0.777099609375, + "reward_std": 0.008961888030171394, + "rewards//mean": 0.777099609375, + "rewards//std": 0.030256683006882668, + "step": 3916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7834, + "grad_norm": 1.539586067199707, + "kl": 0.9469500109553337, + "learning_rate": 1.1373394304245349e-07, + "loss": 0.0947, + "num_tokens": 33853754.0, + "reward": 0.7396240234375, + "reward_std": 0.004714501090347767, + "rewards//mean": 0.7396240234375, + "rewards//std": 0.024922169744968414, + "step": 3917 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7836, + "grad_norm": 1.9085220098495483, + "kl": 1.47810672596097, + "learning_rate": 1.135325223673203e-07, + "loss": 0.1478, + "num_tokens": 33862450.0, + "reward": 0.73687744140625, + "reward_std": 0.006917201913893223, + "rewards//mean": 0.73687744140625, + "rewards//std": 0.03307516500353813, + "step": 3918 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7838, + "grad_norm": 3.2357144355773926, + "kl": 1.6053892597556114, + "learning_rate": 1.1333125736124083e-07, + "loss": 0.1605, + "num_tokens": 33871090.0, + "reward": 0.75054931640625, + "reward_std": 0.009515912272036076, + "rewards//mean": 0.75054931640625, + "rewards//std": 0.039234910160303116, + "step": 3919 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.784, + "grad_norm": 2.0340006351470947, + "kl": 0.7867747042328119, + "learning_rate": 1.1313014810528482e-07, + "loss": 0.0787, + "num_tokens": 33879682.0, + "reward": 0.7442626953125, + "reward_std": 0.004143203608691692, + "rewards//mean": 0.7442626953125, + "rewards//std": 0.03418146073818207, + "step": 3920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7842, + "grad_norm": 5.448980808258057, + "kl": 0.9573967736214399, + "learning_rate": 1.1292919468045875e-07, + "loss": 0.0957, + "num_tokens": 33888290.0, + "reward": 0.756103515625, + "reward_std": 0.005698992405086756, + "rewards//mean": 0.756103515625, + "rewards//std": 0.030288685113191605, + "step": 3921 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7844, + "grad_norm": 9.918346405029297, + "kl": 1.9798985235393047, + "learning_rate": 1.1272839716770677e-07, + "loss": 0.198, + "num_tokens": 33896874.0, + "reward": 0.743408203125, + "reward_std": 0.008350122720003128, + "rewards//mean": 0.743408203125, + "rewards//std": 0.0358072929084301, + "step": 3922 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7846, + "grad_norm": 4.997101306915283, + "kl": 2.2874083314090967, + "learning_rate": 1.1252775564791023e-07, + "loss": 0.2287, + "num_tokens": 33905554.0, + "reward": 0.74530029296875, + "reward_std": 0.01806064508855343, + "rewards//mean": 0.74530029296875, + "rewards//std": 0.028832755982875824, + "step": 3923 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7848, + "grad_norm": 4.5868377685546875, + "kl": 2.022704156115651, + "learning_rate": 1.1232727020188726e-07, + "loss": 0.2023, + "num_tokens": 33914258.0, + "reward": 0.7747802734375, + "reward_std": 0.01081641111522913, + "rewards//mean": 0.7747802734375, + "rewards//std": 0.03592441603541374, + "step": 3924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.785, + "grad_norm": 3.338491201400757, + "kl": 0.8731247782707214, + "learning_rate": 1.1212694091039349e-07, + "loss": 0.0873, + "num_tokens": 33922922.0, + "reward": 0.75982666015625, + "reward_std": 0.006358280312269926, + "rewards//mean": 0.75982666015625, + "rewards//std": 0.02945449762046337, + "step": 3925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7852, + "grad_norm": 3.4145405292510986, + "kl": 0.9957703575491905, + "learning_rate": 1.1192676785412152e-07, + "loss": 0.0996, + "num_tokens": 33931514.0, + "reward": 0.74609375, + "reward_std": 0.007620078045874834, + "rewards//mean": 0.74609375, + "rewards//std": 0.03124224953353405, + "step": 3926 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7854, + "grad_norm": 0.7550323605537415, + "kl": 0.8773941099643707, + "learning_rate": 1.1172675111370122e-07, + "loss": 0.0877, + "num_tokens": 33940082.0, + "reward": 0.71905517578125, + "reward_std": 0.004137867130339146, + "rewards//mean": 0.71905517578125, + "rewards//std": 0.031345222145318985, + "step": 3927 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7856, + "grad_norm": 4.324525356292725, + "kl": 1.4621629230678082, + "learning_rate": 1.1152689076969896e-07, + "loss": 0.1462, + "num_tokens": 33948650.0, + "reward": 0.7552490234375, + "reward_std": 0.0105473343282938, + "rewards//mean": 0.7552490234375, + "rewards//std": 0.033416491001844406, + "step": 3928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7858, + "grad_norm": 7.7892937660217285, + "kl": 1.0337713100016117, + "learning_rate": 1.1132718690261867e-07, + "loss": 0.1034, + "num_tokens": 33957282.0, + "reward": 0.76287841796875, + "reward_std": 0.0029281843453645706, + "rewards//mean": 0.76287841796875, + "rewards//std": 0.029827818274497986, + "step": 3929 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.786, + "grad_norm": 7.077126979827881, + "kl": 2.192488122731447, + "learning_rate": 1.11127639592901e-07, + "loss": 0.2192, + "num_tokens": 33966170.0, + "reward": 0.7769775390625, + "reward_std": 0.013575734570622444, + "rewards//mean": 0.7769775390625, + "rewards//std": 0.038992490619421005, + "step": 3930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7862, + "grad_norm": 0.7073388695716858, + "kl": 0.7025747690349817, + "learning_rate": 1.1092824892092373e-07, + "loss": 0.0703, + "num_tokens": 33974778.0, + "reward": 0.79315185546875, + "reward_std": 0.005577668081969023, + "rewards//mean": 0.79315185546875, + "rewards//std": 0.023182202130556107, + "step": 3931 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7864, + "grad_norm": 3.4573216438293457, + "kl": 1.4150816332548857, + "learning_rate": 1.107290149670011e-07, + "loss": 0.1415, + "num_tokens": 33983338.0, + "reward": 0.74737548828125, + "reward_std": 0.00965961068868637, + "rewards//mean": 0.74737548828125, + "rewards//std": 0.03508421406149864, + "step": 3932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7866, + "grad_norm": 5.318392753601074, + "kl": 1.5157778076827526, + "learning_rate": 1.1052993781138475e-07, + "loss": 0.1516, + "num_tokens": 33991938.0, + "reward": 0.76861572265625, + "reward_std": 0.006830888334661722, + "rewards//mean": 0.76861572265625, + "rewards//std": 0.025655075907707214, + "step": 3933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7868, + "grad_norm": 4.865805149078369, + "kl": 1.708130832761526, + "learning_rate": 1.1033101753426282e-07, + "loss": 0.1708, + "num_tokens": 34000714.0, + "reward": 0.7745361328125, + "reward_std": 0.006780258379876614, + "rewards//mean": 0.7745361328125, + "rewards//std": 0.023056896403431892, + "step": 3934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.787, + "grad_norm": 1.7207118272781372, + "kl": 1.159225095063448, + "learning_rate": 1.1013225421576078e-07, + "loss": 0.1159, + "num_tokens": 34009330.0, + "reward": 0.737548828125, + "reward_std": 0.006382783874869347, + "rewards//mean": 0.737548828125, + "rewards//std": 0.026161137968301773, + "step": 3935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7872, + "grad_norm": 3.8666341304779053, + "kl": 1.6811864990741014, + "learning_rate": 1.0993364793593979e-07, + "loss": 0.1681, + "num_tokens": 34017986.0, + "reward": 0.78369140625, + "reward_std": 0.01143318135291338, + "rewards//mean": 0.78369140625, + "rewards//std": 0.023370232433080673, + "step": 3936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7874, + "grad_norm": 3.597184419631958, + "kl": 1.8087128233164549, + "learning_rate": 1.0973519877479876e-07, + "loss": 0.1809, + "num_tokens": 34026658.0, + "reward": 0.772705078125, + "reward_std": 0.017702333629131317, + "rewards//mean": 0.772705078125, + "rewards//std": 0.027977483347058296, + "step": 3937 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7876, + "grad_norm": 3.6767613887786865, + "kl": 1.126314539462328, + "learning_rate": 1.09536906812273e-07, + "loss": 0.1126, + "num_tokens": 34035274.0, + "reward": 0.697998046875, + "reward_std": 0.004239916801452637, + "rewards//mean": 0.697998046875, + "rewards//std": 0.031046859920024872, + "step": 3938 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7878, + "grad_norm": 11.522296905517578, + "kl": 2.3709292765706778, + "learning_rate": 1.0933877212823461e-07, + "loss": 0.2371, + "num_tokens": 34043834.0, + "reward": 0.76531982421875, + "reward_std": 0.0046653407625854015, + "rewards//mean": 0.76531982421875, + "rewards//std": 0.02718210406601429, + "step": 3939 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.788, + "grad_norm": 2.5910301208496094, + "kl": 1.7482000272721052, + "learning_rate": 1.0914079480249194e-07, + "loss": 0.1748, + "num_tokens": 34052530.0, + "reward": 0.77337646484375, + "reward_std": 0.013549950905144215, + "rewards//mean": 0.77337646484375, + "rewards//std": 0.028267011046409607, + "step": 3940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7882, + "grad_norm": 3.0972018241882324, + "kl": 1.0658121649175882, + "learning_rate": 1.0894297491479043e-07, + "loss": 0.1066, + "num_tokens": 34061074.0, + "reward": 0.751708984375, + "reward_std": 0.005103100091218948, + "rewards//mean": 0.751708984375, + "rewards//std": 0.030063949525356293, + "step": 3941 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7884, + "grad_norm": 1.1352661848068237, + "kl": 0.5813736002892256, + "learning_rate": 1.0874531254481184e-07, + "loss": 0.0581, + "num_tokens": 34069690.0, + "reward": 0.74169921875, + "reward_std": 0.0015413069631904364, + "rewards//mean": 0.74169921875, + "rewards//std": 0.026924218982458115, + "step": 3942 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7886, + "grad_norm": 9.895622253417969, + "kl": 2.5029298197478056, + "learning_rate": 1.0854780777217465e-07, + "loss": 0.2503, + "num_tokens": 34078338.0, + "reward": 0.75543212890625, + "reward_std": 0.007043184246867895, + "rewards//mean": 0.75543212890625, + "rewards//std": 0.027618002146482468, + "step": 3943 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7888, + "grad_norm": 3.7129220962524414, + "kl": 1.5188702084124088, + "learning_rate": 1.083504606764336e-07, + "loss": 0.1519, + "num_tokens": 34086994.0, + "reward": 0.7633056640625, + "reward_std": 0.008930771611630917, + "rewards//mean": 0.7633056640625, + "rewards//std": 0.031022226437926292, + "step": 3944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.789, + "grad_norm": 6.922622203826904, + "kl": 2.197903783991933, + "learning_rate": 1.0815327133708013e-07, + "loss": 0.2198, + "num_tokens": 34095586.0, + "reward": 0.748291015625, + "reward_std": 0.014078168198466301, + "rewards//mean": 0.748291015625, + "rewards//std": 0.040354833006858826, + "step": 3945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7892, + "grad_norm": 1.8126484155654907, + "kl": 1.3330847918987274, + "learning_rate": 1.0795623983354213e-07, + "loss": 0.1333, + "num_tokens": 34104258.0, + "reward": 0.77789306640625, + "reward_std": 0.00931274052709341, + "rewards//mean": 0.77789306640625, + "rewards//std": 0.03176402673125267, + "step": 3946 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7894, + "grad_norm": 1.7259939908981323, + "kl": 0.5841523483395576, + "learning_rate": 1.0775936624518395e-07, + "loss": 0.0584, + "num_tokens": 34112930.0, + "reward": 0.7816162109375, + "reward_std": 0.0029154345393180847, + "rewards//mean": 0.7816162109375, + "rewards//std": 0.019372105598449707, + "step": 3947 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7896, + "grad_norm": 3.633294105529785, + "kl": 1.590855523943901, + "learning_rate": 1.0756265065130604e-07, + "loss": 0.1591, + "num_tokens": 34121634.0, + "reward": 0.7408447265625, + "reward_std": 0.01404031366109848, + "rewards//mean": 0.7408447265625, + "rewards//std": 0.03694154694676399, + "step": 3948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7898, + "grad_norm": 2.123703956604004, + "kl": 0.731162590906024, + "learning_rate": 1.0736609313114548e-07, + "loss": 0.0731, + "num_tokens": 34130178.0, + "reward": 0.76025390625, + "reward_std": 0.006418525241315365, + "rewards//mean": 0.76025390625, + "rewards//std": 0.024384593591094017, + "step": 3949 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.79, + "grad_norm": 4.00430154800415, + "kl": 1.1961615476757288, + "learning_rate": 1.0716969376387563e-07, + "loss": 0.1196, + "num_tokens": 34138890.0, + "reward": 0.722412109375, + "reward_std": 0.007393763400614262, + "rewards//mean": 0.722412109375, + "rewards//std": 0.03259948268532753, + "step": 3950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7902, + "grad_norm": 2.4965059757232666, + "kl": 1.591790109872818, + "learning_rate": 1.0697345262860635e-07, + "loss": 0.1592, + "num_tokens": 34147546.0, + "reward": 0.74908447265625, + "reward_std": 0.00894995778799057, + "rewards//mean": 0.74908447265625, + "rewards//std": 0.026536794379353523, + "step": 3951 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7904, + "grad_norm": 2.9501092433929443, + "kl": 1.0904130283743143, + "learning_rate": 1.0677736980438318e-07, + "loss": 0.109, + "num_tokens": 34156226.0, + "reward": 0.78021240234375, + "reward_std": 0.007875943556427956, + "rewards//mean": 0.78021240234375, + "rewards//std": 0.02814464643597603, + "step": 3952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7906, + "grad_norm": 4.884478569030762, + "kl": 1.6771136969327927, + "learning_rate": 1.0658144537018842e-07, + "loss": 0.1677, + "num_tokens": 34164898.0, + "reward": 0.7530517578125, + "reward_std": 0.011819196864962578, + "rewards//mean": 0.7530517578125, + "rewards//std": 0.029227297753095627, + "step": 3953 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7908, + "grad_norm": 2.8198182582855225, + "kl": 0.9743771161884069, + "learning_rate": 1.0638567940494059e-07, + "loss": 0.0974, + "num_tokens": 34173530.0, + "reward": 0.7703857421875, + "reward_std": 0.007821903564035892, + "rewards//mean": 0.7703857421875, + "rewards//std": 0.03211740404367447, + "step": 3954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.791, + "grad_norm": 7.733407974243164, + "kl": 2.2402735631912947, + "learning_rate": 1.0619007198749386e-07, + "loss": 0.224, + "num_tokens": 34182138.0, + "reward": 0.7706298828125, + "reward_std": 0.014774031937122345, + "rewards//mean": 0.7706298828125, + "rewards//std": 0.03309055417776108, + "step": 3955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7912, + "grad_norm": 2.511195659637451, + "kl": 1.426751671358943, + "learning_rate": 1.0599462319663904e-07, + "loss": 0.1427, + "num_tokens": 34190722.0, + "reward": 0.8026123046875, + "reward_std": 0.010879311710596085, + "rewards//mean": 0.8026123046875, + "rewards//std": 0.02789159305393696, + "step": 3956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7914, + "grad_norm": 4.181887626647949, + "kl": 0.9340474139899015, + "learning_rate": 1.057993331111029e-07, + "loss": 0.0934, + "num_tokens": 34199338.0, + "reward": 0.75335693359375, + "reward_std": 0.005688315723091364, + "rewards//mean": 0.75335693359375, + "rewards//std": 0.026801303029060364, + "step": 3957 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7916, + "grad_norm": 12.510884284973145, + "kl": 2.1339222714304924, + "learning_rate": 1.0560420180954838e-07, + "loss": 0.2134, + "num_tokens": 34208026.0, + "reward": 0.7318115234375, + "reward_std": 0.0069226669147610664, + "rewards//mean": 0.7318115234375, + "rewards//std": 0.025810157880187035, + "step": 3958 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7918, + "grad_norm": 11.036775588989258, + "kl": 1.517913432791829, + "learning_rate": 1.0540922937057405e-07, + "loss": 0.1518, + "num_tokens": 34216666.0, + "reward": 0.75384521484375, + "reward_std": 0.006471584551036358, + "rewards//mean": 0.75384521484375, + "rewards//std": 0.040406517684459686, + "step": 3959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.792, + "grad_norm": 2.6912834644317627, + "kl": 1.4423391576856375, + "learning_rate": 1.0521441587271496e-07, + "loss": 0.1442, + "num_tokens": 34225330.0, + "reward": 0.76715087890625, + "reward_std": 0.010383655317127705, + "rewards//mean": 0.76715087890625, + "rewards//std": 0.03179260715842247, + "step": 3960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7922, + "grad_norm": 2.889589309692383, + "kl": 1.2809564415365458, + "learning_rate": 1.0501976139444191e-07, + "loss": 0.1281, + "num_tokens": 34233994.0, + "reward": 0.76556396484375, + "reward_std": 0.01011392381042242, + "rewards//mean": 0.76556396484375, + "rewards//std": 0.03066709265112877, + "step": 3961 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7924, + "grad_norm": 2.752598524093628, + "kl": 0.9556526094675064, + "learning_rate": 1.0482526601416186e-07, + "loss": 0.0956, + "num_tokens": 34242666.0, + "reward": 0.78399658203125, + "reward_std": 0.008804308250546455, + "rewards//mean": 0.78399658203125, + "rewards//std": 0.028962666168808937, + "step": 3962 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7926, + "grad_norm": 3.3719711303710938, + "kl": 1.550707921385765, + "learning_rate": 1.0463092981021732e-07, + "loss": 0.1551, + "num_tokens": 34251442.0, + "reward": 0.77032470703125, + "reward_std": 0.009393202140927315, + "rewards//mean": 0.77032470703125, + "rewards//std": 0.03473122790455818, + "step": 3963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7928, + "grad_norm": 5.523403167724609, + "kl": 1.593555772677064, + "learning_rate": 1.0443675286088694e-07, + "loss": 0.1594, + "num_tokens": 34260170.0, + "reward": 0.74957275390625, + "reward_std": 0.0025972009170800447, + "rewards//mean": 0.74957275390625, + "rewards//std": 0.03260128200054169, + "step": 3964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.793, + "grad_norm": 3.1733345985412598, + "kl": 1.5834615100175142, + "learning_rate": 1.042427352443852e-07, + "loss": 0.1583, + "num_tokens": 34268778.0, + "reward": 0.75897216796875, + "reward_std": 0.011987416073679924, + "rewards//mean": 0.75897216796875, + "rewards//std": 0.0337587371468544, + "step": 3965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7932, + "grad_norm": 11.371508598327637, + "kl": 1.6594927590340376, + "learning_rate": 1.040488770388625e-07, + "loss": 0.1659, + "num_tokens": 34277482.0, + "reward": 0.7677001953125, + "reward_std": 0.010774901136755943, + "rewards//mean": 0.7677001953125, + "rewards//std": 0.0317266546189785, + "step": 3966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7934, + "grad_norm": 16.085100173950195, + "kl": 3.6442190278321505, + "learning_rate": 1.038551783224047e-07, + "loss": 0.3644, + "num_tokens": 34286266.0, + "reward": 0.7669677734375, + "reward_std": 0.02363688126206398, + "rewards//mean": 0.7669677734375, + "rewards//std": 0.0427694171667099, + "step": 3967 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7936, + "grad_norm": 3.4335145950317383, + "kl": 1.7057804800570011, + "learning_rate": 1.0366163917303367e-07, + "loss": 0.1706, + "num_tokens": 34294890.0, + "reward": 0.75311279296875, + "reward_std": 0.003995015751570463, + "rewards//mean": 0.75311279296875, + "rewards//std": 0.019340332597494125, + "step": 3968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7938, + "grad_norm": 2.2967724800109863, + "kl": 1.2740641385316849, + "learning_rate": 1.034682596687071e-07, + "loss": 0.1274, + "num_tokens": 34303626.0, + "reward": 0.768798828125, + "reward_std": 0.005805861204862595, + "rewards//mean": 0.768798828125, + "rewards//std": 0.03672228008508682, + "step": 3969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.794, + "grad_norm": 3.043201446533203, + "kl": 2.123381871730089, + "learning_rate": 1.0327503988731795e-07, + "loss": 0.2123, + "num_tokens": 34312258.0, + "reward": 0.76605224609375, + "reward_std": 0.012116423808038235, + "rewards//mean": 0.76605224609375, + "rewards//std": 0.032593853771686554, + "step": 3970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7942, + "grad_norm": 1.3511651754379272, + "kl": 0.9702730886638165, + "learning_rate": 1.0308197990669537e-07, + "loss": 0.097, + "num_tokens": 34320858.0, + "reward": 0.7637939453125, + "reward_std": 0.00715398695319891, + "rewards//mean": 0.7637939453125, + "rewards//std": 0.03070242702960968, + "step": 3971 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7944, + "grad_norm": 0.9036258459091187, + "kl": 0.7749298624694347, + "learning_rate": 1.0288907980460377e-07, + "loss": 0.0775, + "num_tokens": 34329554.0, + "reward": 0.77862548828125, + "reward_std": 0.003989311866462231, + "rewards//mean": 0.77862548828125, + "rewards//std": 0.02178863063454628, + "step": 3972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7946, + "grad_norm": 3.5450775623321533, + "kl": 0.962246784940362, + "learning_rate": 1.0269633965874347e-07, + "loss": 0.0962, + "num_tokens": 34338082.0, + "reward": 0.78131103515625, + "reward_std": 0.005833945702761412, + "rewards//mean": 0.78131103515625, + "rewards//std": 0.022124387323856354, + "step": 3973 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7948, + "grad_norm": 3.3692398071289062, + "kl": 0.7408181671053171, + "learning_rate": 1.025037595467499e-07, + "loss": 0.0741, + "num_tokens": 34346786.0, + "reward": 0.7210693359375, + "reward_std": 0.004162712953984737, + "rewards//mean": 0.7210693359375, + "rewards//std": 0.036022040992975235, + "step": 3974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.795, + "grad_norm": 9.130332946777344, + "kl": 1.1871028784662485, + "learning_rate": 1.0231133954619447e-07, + "loss": 0.1187, + "num_tokens": 34355442.0, + "reward": 0.728271484375, + "reward_std": 0.006527550518512726, + "rewards//mean": 0.728271484375, + "rewards//std": 0.02998328022658825, + "step": 3975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7952, + "grad_norm": 3.2529711723327637, + "kl": 1.373081335797906, + "learning_rate": 1.021190797345839e-07, + "loss": 0.1373, + "num_tokens": 34364010.0, + "reward": 0.7794189453125, + "reward_std": 0.008333644829690456, + "rewards//mean": 0.7794189453125, + "rewards//std": 0.03868849202990532, + "step": 3976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7954, + "grad_norm": 1.0868194103240967, + "kl": 1.0680901762098074, + "learning_rate": 1.0192698018936058e-07, + "loss": 0.1068, + "num_tokens": 34372610.0, + "reward": 0.7899169921875, + "reward_std": 0.004978492856025696, + "rewards//mean": 0.7899169921875, + "rewards//std": 0.020954687148332596, + "step": 3977 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7956, + "grad_norm": 2.964296579360962, + "kl": 1.7145035937428474, + "learning_rate": 1.0173504098790186e-07, + "loss": 0.1715, + "num_tokens": 34381242.0, + "reward": 0.7640380859375, + "reward_std": 0.019781235605478287, + "rewards//mean": 0.7640380859375, + "rewards//std": 0.043873608112335205, + "step": 3978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7958, + "grad_norm": 16.35740089416504, + "kl": 2.6129555255174637, + "learning_rate": 1.0154326220752107e-07, + "loss": 0.2613, + "num_tokens": 34389962.0, + "reward": 0.73419189453125, + "reward_std": 0.00733737088739872, + "rewards//mean": 0.73419189453125, + "rewards//std": 0.03942312300205231, + "step": 3979 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.796, + "grad_norm": 8.994636535644531, + "kl": 1.4033876154571772, + "learning_rate": 1.0135164392546658e-07, + "loss": 0.1403, + "num_tokens": 34398522.0, + "reward": 0.76947021484375, + "reward_std": 0.0029546150472015142, + "rewards//mean": 0.76947021484375, + "rewards//std": 0.024798179045319557, + "step": 3980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7962, + "grad_norm": 6.220097541809082, + "kl": 1.122046809643507, + "learning_rate": 1.0116018621892236e-07, + "loss": 0.1122, + "num_tokens": 34407082.0, + "reward": 0.78009033203125, + "reward_std": 0.008151261135935783, + "rewards//mean": 0.78009033203125, + "rewards//std": 0.02932831272482872, + "step": 3981 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7964, + "grad_norm": 13.410201072692871, + "kl": 2.444033235311508, + "learning_rate": 1.0096888916500734e-07, + "loss": 0.2444, + "num_tokens": 34415818.0, + "reward": 0.7667236328125, + "reward_std": 0.012539811432361603, + "rewards//mean": 0.7667236328125, + "rewards//std": 0.03412117809057236, + "step": 3982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7966, + "grad_norm": 8.044370651245117, + "kl": 2.664042577147484, + "learning_rate": 1.00777752840776e-07, + "loss": 0.2664, + "num_tokens": 34424386.0, + "reward": 0.739990234375, + "reward_std": 0.012422893196344376, + "rewards//mean": 0.739990234375, + "rewards//std": 0.028381450101733208, + "step": 3983 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7968, + "grad_norm": 1.248191237449646, + "kl": 0.9969297703355551, + "learning_rate": 1.0058677732321824e-07, + "loss": 0.0997, + "num_tokens": 34433090.0, + "reward": 0.736328125, + "reward_std": 0.002992020919919014, + "rewards//mean": 0.736328125, + "rewards//std": 0.03411785140633583, + "step": 3984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.797, + "grad_norm": 6.569794178009033, + "kl": 1.5086831990629435, + "learning_rate": 1.0039596268925865e-07, + "loss": 0.1509, + "num_tokens": 34441850.0, + "reward": 0.70654296875, + "reward_std": 0.0060121589340269566, + "rewards//mean": 0.70654296875, + "rewards//std": 0.03396487981081009, + "step": 3985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7972, + "grad_norm": 2.8830041885375977, + "kl": 0.9853732716292143, + "learning_rate": 1.0020530901575752e-07, + "loss": 0.0985, + "num_tokens": 34450458.0, + "reward": 0.78009033203125, + "reward_std": 0.007049919106066227, + "rewards//mean": 0.78009033203125, + "rewards//std": 0.022038694471120834, + "step": 3986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7974, + "grad_norm": 5.60346794128418, + "kl": 0.8125039637088776, + "learning_rate": 1.0001481637951009e-07, + "loss": 0.0813, + "num_tokens": 34459106.0, + "reward": 0.75482177734375, + "reward_std": 0.006308666430413723, + "rewards//mean": 0.75482177734375, + "rewards//std": 0.028320513665676117, + "step": 3987 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7976, + "grad_norm": 4.042594909667969, + "kl": 0.7735357247292995, + "learning_rate": 9.982448485724692e-08, + "loss": 0.0774, + "num_tokens": 34467786.0, + "reward": 0.73272705078125, + "reward_std": 0.006879073567688465, + "rewards//mean": 0.73272705078125, + "rewards//std": 0.030248112976551056, + "step": 3988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7978, + "grad_norm": 3.227397918701172, + "kl": 1.5894843433052301, + "learning_rate": 9.963431452563331e-08, + "loss": 0.1589, + "num_tokens": 34476426.0, + "reward": 0.74591064453125, + "reward_std": 0.006713111884891987, + "rewards//mean": 0.74591064453125, + "rewards//std": 0.0261477530002594, + "step": 3989 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.798, + "grad_norm": 4.341818809509277, + "kl": 0.8219067975878716, + "learning_rate": 9.944430546126987e-08, + "loss": 0.0822, + "num_tokens": 34485026.0, + "reward": 0.74176025390625, + "reward_std": 0.006621167995035648, + "rewards//mean": 0.74176025390625, + "rewards//std": 0.0295867957174778, + "step": 3990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7982, + "grad_norm": 6.1545090675354, + "kl": 1.2499432228505611, + "learning_rate": 9.92544577406923e-08, + "loss": 0.125, + "num_tokens": 34493650.0, + "reward": 0.773681640625, + "reward_std": 0.004224213771522045, + "rewards//mean": 0.773681640625, + "rewards//std": 0.025304628536105156, + "step": 3991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7984, + "grad_norm": 3.627901077270508, + "kl": 1.1244537066668272, + "learning_rate": 9.90647714403714e-08, + "loss": 0.1124, + "num_tokens": 34502226.0, + "reward": 0.76910400390625, + "reward_std": 0.011032147333025932, + "rewards//mean": 0.76910400390625, + "rewards//std": 0.035352855920791626, + "step": 3992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7986, + "grad_norm": 2.111823797225952, + "kl": 1.4076343681663275, + "learning_rate": 9.887524663671243e-08, + "loss": 0.1408, + "num_tokens": 34510834.0, + "reward": 0.77862548828125, + "reward_std": 0.00782176572829485, + "rewards//mean": 0.77862548828125, + "rewards//std": 0.02257480099797249, + "step": 3993 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7988, + "grad_norm": 3.036313772201538, + "kl": 0.9956541359424591, + "learning_rate": 9.868588340605621e-08, + "loss": 0.0996, + "num_tokens": 34519498.0, + "reward": 0.74951171875, + "reward_std": 0.005567497573792934, + "rewards//mean": 0.74951171875, + "rewards//std": 0.03282622620463371, + "step": 3994 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.799, + "grad_norm": 3.495222330093384, + "kl": 1.2585793230682611, + "learning_rate": 9.849668182467807e-08, + "loss": 0.1259, + "num_tokens": 34528130.0, + "reward": 0.74700927734375, + "reward_std": 0.0030434252694249153, + "rewards//mean": 0.74700927734375, + "rewards//std": 0.02947710081934929, + "step": 3995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7992, + "grad_norm": 7.799647808074951, + "kl": 1.9383725747466087, + "learning_rate": 9.830764196878871e-08, + "loss": 0.1938, + "num_tokens": 34536882.0, + "reward": 0.77545166015625, + "reward_std": 0.01784929633140564, + "rewards//mean": 0.77545166015625, + "rewards//std": 0.03682184964418411, + "step": 3996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7994, + "grad_norm": 3.5614383220672607, + "kl": 0.8620524164289236, + "learning_rate": 9.811876391453294e-08, + "loss": 0.0862, + "num_tokens": 34545506.0, + "reward": 0.77386474609375, + "reward_std": 0.004311136901378632, + "rewards//mean": 0.77386474609375, + "rewards//std": 0.02702852338552475, + "step": 3997 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7996, + "grad_norm": 1.4340484142303467, + "kl": 1.4525028876960278, + "learning_rate": 9.793004773799102e-08, + "loss": 0.1453, + "num_tokens": 34554122.0, + "reward": 0.786865234375, + "reward_std": 0.008866466581821442, + "rewards//mean": 0.786865234375, + "rewards//std": 0.02712475322186947, + "step": 3998 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.7998, + "grad_norm": 1.5810558795928955, + "kl": 1.0686696972697973, + "learning_rate": 9.774149351517774e-08, + "loss": 0.1069, + "num_tokens": 34562786.0, + "reward": 0.7431640625, + "reward_std": 0.003059720853343606, + "rewards//mean": 0.7431640625, + "rewards//std": 0.02984665334224701, + "step": 3999 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completion_length": 128.0, + "epoch": 0.8, + "grad_norm": 13.422774314880371, + "kl": 2.5568079724907875, + "learning_rate": 9.755310132204297e-08, + "loss": 0.2557, + "num_tokens": 34571410.0, + "reward": 0.73638916015625, + "reward_std": 0.015636425465345383, + "rewards//mean": 0.73638916015625, + "rewards//std": 0.029919534921646118, + "step": 4000 + } + ], + "logging_steps": 1, + "max_steps": 5000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}