diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,2834 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.04118616144975288, + "eval_steps": 1000, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4658203125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 88.3837890625, + "completions/mean_terminated_length": 53.83729553222656, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 2.0787938493303955, + "epoch": 0.00041186161449752884, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1026352643966675, + "kl": 1.5408068257727336e-05, + "learning_rate": 0.0, + "loss": 0.0612, + "num_tokens": 473618.0, + "reward": -0.654300332069397, + "reward_std": 1.2014957666397095, + "rewards/reward_model/mean": -0.654300332069397, + "rewards/reward_model/std": 1.4879947900772095, + "step": 1, + "step_time": 179.40438475832343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.521484375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 95.0380859375, + "completions/mean_terminated_length": 59.11632537841797, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 2.0889650019817054, + "epoch": 0.0008237232289950577, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9445520043373108, + "kl": 1.5487904489575044e-05, + "learning_rate": 1.2345679012345681e-08, + "loss": 0.0685, + "num_tokens": 944384.0, + "reward": -0.6944406032562256, + "reward_std": 1.1158981323242188, + "rewards/reward_model/mean": -0.6944406032562256, + "rewards/reward_model/std": 1.4779117107391357, + "step": 2, + "step_time": 168.28568758117035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4921875, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 93.1064453125, + "completions/mean_terminated_length": 59.286537170410156, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 2.1078118681907654, + "epoch": 0.0012355848434925864, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9809994101524353, + "kl": 0.0009136445219155576, + "learning_rate": 2.4691358024691362e-08, + "loss": 0.061, + "num_tokens": 1417434.0, + "reward": -0.8067716956138611, + "reward_std": 1.1805193424224854, + "rewards/reward_model/mean": -0.8067716956138611, + "rewards/reward_model/std": 1.5296157598495483, + "step": 3, + "step_time": 168.7894278760068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4912109375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 90.3203125, + "completions/mean_terminated_length": 53.94241714477539, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 2.0778324585407972, + "epoch": 0.0016474464579901153, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9622601270675659, + "kl": 0.0009502729969881329, + "learning_rate": 3.7037037037037036e-08, + "loss": 0.0854, + "num_tokens": 1886250.0, + "reward": -0.5533753037452698, + "reward_std": 1.0693888664245605, + "rewards/reward_model/mean": -0.5533753037452698, + "rewards/reward_model/std": 1.3799840211868286, + "step": 4, + "step_time": 167.94514833204448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.47802734375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 88.8896484375, + "completions/mean_terminated_length": 53.07202911376953, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 2.164030898362398, + "epoch": 0.002059308072487644, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0810271501541138, + "kl": 0.0010240612220968615, + "learning_rate": 4.9382716049382724e-08, + "loss": 0.0979, + "num_tokens": 2372616.0, + "reward": -0.8290466070175171, + "reward_std": 1.1383775472640991, + "rewards/reward_model/mean": -0.8290466070175171, + "rewards/reward_model/std": 1.4821057319641113, + "step": 5, + "step_time": 168.5208105482161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.44775390625, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 86.54541015625, + "completions/mean_terminated_length": 52.934574127197266, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 2.121390470303595, + "epoch": 0.002471169686985173, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.026131272315979, + "kl": 0.000956788239591333, + "learning_rate": 6.17283950617284e-08, + "loss": 0.102, + "num_tokens": 2856453.0, + "reward": -0.5948619842529297, + "reward_std": 1.0859686136245728, + "rewards/reward_model/mean": -0.5948619842529297, + "rewards/reward_model/std": 1.4433753490447998, + "step": 6, + "step_time": 169.30755526619032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.47998046875, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 88.40185546875, + "completions/mean_terminated_length": 51.85258483886719, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 2.1293580746278167, + "epoch": 0.002883031301482702, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9581882953643799, + "kl": 0.0010509827170608332, + "learning_rate": 7.407407407407407e-08, + "loss": 0.0752, + "num_tokens": 3349660.0, + "reward": -0.8746315836906433, + "reward_std": 1.1371493339538574, + "rewards/reward_model/mean": -0.8746315836906433, + "rewards/reward_model/std": 1.5432283878326416, + "step": 7, + "step_time": 170.4541406123899 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.462890625, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 89.59423828125, + "completions/mean_terminated_length": 56.49545669555664, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 2.199626039713621, + "epoch": 0.0032948929159802307, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1233900785446167, + "kl": 0.0011659049921490805, + "learning_rate": 8.641975308641976e-08, + "loss": 0.0835, + "num_tokens": 3869181.0, + "reward": -0.9943232536315918, + "reward_std": 1.099515438079834, + "rewards/reward_model/mean": -0.9943232536315918, + "rewards/reward_model/std": 1.4042030572891235, + "step": 8, + "step_time": 168.8292339304462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.50244140625, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 92.26611328125, + "completions/mean_terminated_length": 56.181549072265625, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "entropy": 2.103476638905704, + "epoch": 0.0037067545304777594, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0071220397949219, + "kl": 0.0010218678287401417, + "learning_rate": 9.876543209876545e-08, + "loss": 0.0786, + "num_tokens": 4330526.0, + "reward": -0.7287623286247253, + "reward_std": 1.2205724716186523, + "rewards/reward_model/mean": -0.7287623286247253, + "rewards/reward_model/std": 1.5410621166229248, + "step": 9, + "step_time": 168.4490856071934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4384765625, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 87.068359375, + "completions/mean_terminated_length": 55.10608673095703, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "entropy": 2.068316952791065, + "epoch": 0.004118616144975288, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0786375999450684, + "kl": 0.0011272716699295415, + "learning_rate": 1.1111111111111111e-07, + "loss": 0.0808, + "num_tokens": 4813482.0, + "reward": -0.8588310480117798, + "reward_std": 1.1204930543899536, + "rewards/reward_model/mean": -0.8588310480117798, + "rewards/reward_model/std": 1.4020955562591553, + "step": 10, + "step_time": 169.3498973324895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.486328125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 88.27197265625, + "completions/mean_terminated_length": 50.65874481201172, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 2.085946503095329, + "epoch": 0.004530477759472817, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1249303817749023, + "kl": 0.0010357791773003555, + "learning_rate": 1.234567901234568e-07, + "loss": 0.103, + "num_tokens": 5276279.0, + "reward": -0.7370425462722778, + "reward_std": 1.1393404006958008, + "rewards/reward_model/mean": -0.7370425462722778, + "rewards/reward_model/std": 1.435203194618225, + "step": 11, + "step_time": 169.61693120608106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.435546875, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 84.62060546875, + "completions/mean_terminated_length": 51.147926330566406, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 2.0578739237971604, + "epoch": 0.004942339373970346, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1678587198257446, + "kl": 0.0010205526389199804, + "learning_rate": 1.3580246913580248e-07, + "loss": 0.129, + "num_tokens": 5750318.0, + "reward": -0.6621623039245605, + "reward_std": 1.1341545581817627, + "rewards/reward_model/mean": -0.6621623039245605, + "rewards/reward_model/std": 1.4956636428833008, + "step": 12, + "step_time": 170.6942683076486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.48876953125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 89.955078125, + "completions/mean_terminated_length": 53.581661224365234, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "entropy": 2.1759862853214145, + "epoch": 0.005354200988467875, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9353419542312622, + "kl": 0.0009984489861381007, + "learning_rate": 1.4814814814814815e-07, + "loss": 0.0706, + "num_tokens": 6237106.0, + "reward": -0.71650230884552, + "reward_std": 1.1081366539001465, + "rewards/reward_model/mean": -0.71650230884552, + "rewards/reward_model/std": 1.4882901906967163, + "step": 13, + "step_time": 168.60461562033743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.47265625, + "completions/max_length": 128.0, + "completions/max_terminated_length": 127.0, + "completions/mean_length": 90.6298828125, + "completions/mean_terminated_length": 57.13518524169922, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 2.1225685542449355, + "epoch": 0.005766062602965404, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9926177263259888, + "kl": 0.0010822901392657513, + "learning_rate": 1.6049382716049383e-07, + "loss": 0.0705, + "num_tokens": 6768988.0, + "reward": -0.8033103346824646, + "reward_std": 1.1658474206924438, + "rewards/reward_model/mean": -0.8033103346824646, + "rewards/reward_model/std": 1.5343424081802368, + "step": 14, + "step_time": 169.76986178942025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.43359375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 87.02978515625, + "completions/mean_terminated_length": 55.666378021240234, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 2.2208039346151054, + "epoch": 0.006177924217462933, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.110655426979065, + "kl": 0.00102766218378747, + "learning_rate": 1.7283950617283952e-07, + "loss": 0.1137, + "num_tokens": 7264761.0, + "reward": -0.8211149573326111, + "reward_std": 1.1067304611206055, + "rewards/reward_model/mean": -0.8211149573326111, + "rewards/reward_model/std": 1.4263983964920044, + "step": 15, + "step_time": 169.37415388552472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.447265625, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 89.8525390625, + "completions/mean_terminated_length": 58.984100341796875, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 2.036483039613813, + "epoch": 0.006589785831960461, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0956333875656128, + "kl": 0.0011385848249574337, + "learning_rate": 1.8518518518518518e-07, + "loss": 0.0679, + "num_tokens": 7745675.0, + "reward": -0.5313577651977539, + "reward_std": 1.1804759502410889, + "rewards/reward_model/mean": -0.5313577651977539, + "rewards/reward_model/std": 1.5051146745681763, + "step": 16, + "step_time": 168.84541190741584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.46142578125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 87.02197265625, + "completions/mean_terminated_length": 51.91387176513672, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "entropy": 2.1997494087554514, + "epoch": 0.00700164744645799, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0221517086029053, + "kl": 0.0010186115512169636, + "learning_rate": 1.975308641975309e-07, + "loss": 0.0962, + "num_tokens": 8243704.0, + "reward": -0.89983731508255, + "reward_std": 1.135831356048584, + "rewards/reward_model/mean": -0.89983731508255, + "rewards/reward_model/std": 1.4320958852767944, + "step": 17, + "step_time": 168.78324813907966 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.45947265625, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 88.05908203125, + "completions/mean_terminated_length": 54.10749816894531, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 2.176318216137588, + "epoch": 0.007413509060955519, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0268303155899048, + "kl": 0.0010050315393073106, + "learning_rate": 2.0987654320987656e-07, + "loss": 0.0845, + "num_tokens": 8726801.0, + "reward": -0.7434755563735962, + "reward_std": 1.1786913871765137, + "rewards/reward_model/mean": -0.7434755563735962, + "rewards/reward_model/std": 1.4701310396194458, + "step": 18, + "step_time": 168.51915573468432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4658203125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 88.00439453125, + "completions/mean_terminated_length": 53.12705993652344, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 2.0752989565953612, + "epoch": 0.007825370675453048, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0904563665390015, + "kl": 0.00123250130627639, + "learning_rate": 2.2222222222222222e-07, + "loss": 0.0885, + "num_tokens": 9180858.0, + "reward": -0.8568893074989319, + "reward_std": 1.1963412761688232, + "rewards/reward_model/mean": -0.8568893074989319, + "rewards/reward_model/std": 1.5186042785644531, + "step": 19, + "step_time": 170.01141701499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4033203125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 83.22021484375, + "completions/mean_terminated_length": 52.95172119140625, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "entropy": 2.200795284938067, + "epoch": 0.008237232289950576, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0073621273040771, + "kl": 0.0012237756848207937, + "learning_rate": 2.3456790123456793e-07, + "loss": 0.0782, + "num_tokens": 9702557.0, + "reward": -0.9474191069602966, + "reward_std": 1.101952314376831, + "rewards/reward_model/mean": -0.9474191665649414, + "rewards/reward_model/std": 1.514784336090088, + "step": 20, + "step_time": 168.7904914407991 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.44189453125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 127.0, + "completions/mean_length": 85.07275390625, + "completions/mean_terminated_length": 51.08399200439453, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 2.1343746068887413, + "epoch": 0.008649093904448105, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.1044775247573853, + "kl": 0.0011666002192214364, + "learning_rate": 2.469135802469136e-07, + "loss": 0.0659, + "num_tokens": 10182002.0, + "reward": -0.8981258869171143, + "reward_std": 1.1897304058074951, + "rewards/reward_model/mean": -0.8981258869171143, + "rewards/reward_model/std": 1.4881244897842407, + "step": 21, + "step_time": 168.61277754418552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4580078125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 88.984375, + "completions/mean_terminated_length": 56.0144157409668, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 2.060287212021649, + "epoch": 0.009060955518945634, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9674479961395264, + "kl": 0.0012853052542141086, + "learning_rate": 2.5925925925925923e-07, + "loss": 0.0795, + "num_tokens": 10599858.0, + "reward": -0.7459607720375061, + "reward_std": 1.18560791015625, + "rewards/reward_model/mean": -0.7459607720375061, + "rewards/reward_model/std": 1.4447804689407349, + "step": 22, + "step_time": 168.57235636515543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.45751953125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 87.20947265625, + "completions/mean_terminated_length": 52.80738067626953, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "entropy": 1.9992840560153127, + "epoch": 0.009472817133443162, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2746193408966064, + "kl": 0.001352380840899059, + "learning_rate": 2.7160493827160497e-07, + "loss": 0.0805, + "num_tokens": 11135295.0, + "reward": -0.9941644668579102, + "reward_std": 1.2033442258834839, + "rewards/reward_model/mean": -0.9941644668579102, + "rewards/reward_model/std": 1.5118839740753174, + "step": 23, + "step_time": 168.93097670795396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.458984375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 88.12744140625, + "completions/mean_terminated_length": 54.300540924072266, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "entropy": 2.1036753226071596, + "epoch": 0.009884678747940691, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9719477295875549, + "kl": 0.0014479562626092957, + "learning_rate": 2.839506172839506e-07, + "loss": 0.0792, + "num_tokens": 11647428.0, + "reward": -0.7246302366256714, + "reward_std": 1.1223700046539307, + "rewards/reward_model/mean": -0.7246302366256714, + "rewards/reward_model/std": 1.4486252069473267, + "step": 24, + "step_time": 168.1220847275108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.48876953125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 92.43017578125, + "completions/mean_terminated_length": 58.42311477661133, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 2.0290252747945487, + "epoch": 0.01029654036243822, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.036043643951416, + "kl": 0.0014065650616430503, + "learning_rate": 2.962962962962963e-07, + "loss": 0.0464, + "num_tokens": 12175925.0, + "reward": -0.8139803409576416, + "reward_std": 1.18918776512146, + "rewards/reward_model/mean": -0.8139803409576416, + "rewards/reward_model/std": 1.5184983015060425, + "step": 25, + "step_time": 169.08092289417982 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.48193359375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 91.38720703125, + "completions/mean_terminated_length": 57.3279914855957, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 2.0865674833767116, + "epoch": 0.01070840197693575, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9221011996269226, + "kl": 0.0016496309149260924, + "learning_rate": 3.08641975308642e-07, + "loss": 0.0459, + "num_tokens": 12671022.0, + "reward": -0.6815944910049438, + "reward_std": 1.1987043619155884, + "rewards/reward_model/mean": -0.6815944910049438, + "rewards/reward_model/std": 1.503211259841919, + "step": 26, + "step_time": 169.66068721655756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.39208984375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 82.84814453125, + "completions/mean_terminated_length": 53.726104736328125, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 2.100129804573953, + "epoch": 0.011120263591433279, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0829551219940186, + "kl": 0.0020542718434626295, + "learning_rate": 3.2098765432098767e-07, + "loss": 0.1111, + "num_tokens": 13159479.0, + "reward": -0.7841147780418396, + "reward_std": 1.1083781719207764, + "rewards/reward_model/mean": -0.7841147780418396, + "rewards/reward_model/std": 1.398116946220398, + "step": 27, + "step_time": 170.63890342088416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.41845703125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 84.29150390625, + "completions/mean_terminated_length": 52.84046936035156, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 2.0639419481158257, + "epoch": 0.011532125205930808, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9612188935279846, + "kl": 0.002871143702122936, + "learning_rate": 3.333333333333333e-07, + "loss": 0.0511, + "num_tokens": 13624972.0, + "reward": -0.41133514046669006, + "reward_std": 1.0870225429534912, + "rewards/reward_model/mean": -0.41133514046669006, + "rewards/reward_model/std": 1.3928031921386719, + "step": 28, + "step_time": 169.01845826301724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3896484375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 82.62353515625, + "completions/mean_terminated_length": 53.65519714355469, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 2.0633218064904213, + "epoch": 0.011943986820428337, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0220043659210205, + "kl": 0.004001651409907936, + "learning_rate": 3.4567901234567904e-07, + "loss": 0.0608, + "num_tokens": 14084265.0, + "reward": -0.5280731916427612, + "reward_std": 1.139591097831726, + "rewards/reward_model/mean": -0.5280731916427612, + "rewards/reward_model/std": 1.5284217596054077, + "step": 29, + "step_time": 169.68904952565208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3447265625, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 77.75439453125, + "completions/mean_terminated_length": 51.321163177490234, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 2.053064794279635, + "epoch": 0.012355848434925865, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0159528255462646, + "kl": 0.004774259470650577, + "learning_rate": 3.580246913580247e-07, + "loss": 0.0826, + "num_tokens": 14545778.0, + "reward": -0.8308598399162292, + "reward_std": 1.1439062356948853, + "rewards/reward_model/mean": -0.8308598399162292, + "rewards/reward_model/std": 1.4677071571350098, + "step": 30, + "step_time": 169.4162016301416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40478515625, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 84.13525390625, + "completions/mean_terminated_length": 54.30434799194336, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "entropy": 1.9778149635531008, + "epoch": 0.012767710049423394, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9643027186393738, + "kl": 0.004632160428627685, + "learning_rate": 3.7037037037037036e-07, + "loss": 0.0324, + "num_tokens": 15021479.0, + "reward": -0.5928993225097656, + "reward_std": 1.0915915966033936, + "rewards/reward_model/mean": -0.5928993225097656, + "rewards/reward_model/std": 1.4171936511993408, + "step": 31, + "step_time": 169.55369784962386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.39404296875, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 81.93603515625, + "completions/mean_terminated_length": 51.981468200683594, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.9114997563883662, + "epoch": 0.013179571663920923, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9297622442245483, + "kl": 0.005185256256481807, + "learning_rate": 3.8271604938271605e-07, + "loss": 0.0641, + "num_tokens": 15490468.0, + "reward": -0.4294321537017822, + "reward_std": 1.1095049381256104, + "rewards/reward_model/mean": -0.4294321537017822, + "rewards/reward_model/std": 1.4001518487930298, + "step": 32, + "step_time": 169.97963417787105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4033203125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 84.73388671875, + "completions/mean_terminated_length": 55.48854446411133, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "entropy": 1.9920633286237717, + "epoch": 0.013591433278418451, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9739150404930115, + "kl": 0.0050489629365984, + "learning_rate": 3.950617283950618e-07, + "loss": 0.069, + "num_tokens": 16033027.0, + "reward": -0.5853164792060852, + "reward_std": 1.1397128105163574, + "rewards/reward_model/mean": -0.5853164792060852, + "rewards/reward_model/std": 1.4342437982559204, + "step": 33, + "step_time": 170.0918092643842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.33349609375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 80.16943359375, + "completions/mean_terminated_length": 56.236629486083984, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 2.052212963812053, + "epoch": 0.01400329489291598, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.0108827352523804, + "kl": 0.0070345894837373635, + "learning_rate": 4.0740740740740737e-07, + "loss": 0.0717, + "num_tokens": 16443422.0, + "reward": -0.40320760011672974, + "reward_std": 1.023691177368164, + "rewards/reward_model/mean": -0.40320760011672974, + "rewards/reward_model/std": 1.3064631223678589, + "step": 34, + "step_time": 168.22575595136732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3896484375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 82.1884765625, + "completions/mean_terminated_length": 52.94239807128906, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.8943170690909028, + "epoch": 0.014415156507413509, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9463284015655518, + "kl": 0.006503033908302314, + "learning_rate": 4.197530864197531e-07, + "loss": 0.0771, + "num_tokens": 16958848.0, + "reward": -0.46641844511032104, + "reward_std": 1.1392958164215088, + "rewards/reward_model/mean": -0.46641844511032104, + "rewards/reward_model/std": 1.3904635906219482, + "step": 35, + "step_time": 169.57979472074658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2724609375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 69.291015625, + "completions/mean_terminated_length": 47.3046989440918, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 2.0284548006020486, + "epoch": 0.014827018121911038, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.202287197113037, + "kl": 0.008711300118193321, + "learning_rate": 4.320987654320988e-07, + "loss": 0.0853, + "num_tokens": 17445812.0, + "reward": -0.5143425464630127, + "reward_std": 1.080782175064087, + "rewards/reward_model/mean": -0.5143425464630127, + "rewards/reward_model/std": 1.3849540948867798, + "step": 36, + "step_time": 169.69654387421906 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3369140625, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 77.77197265625, + "completions/mean_terminated_length": 52.25110626220703, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.8757861303165555, + "epoch": 0.015238879736408566, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8937506675720215, + "kl": 0.010500569681425986, + "learning_rate": 4.4444444444444444e-07, + "loss": 0.0466, + "num_tokens": 17886785.0, + "reward": -0.2941930890083313, + "reward_std": 1.089874267578125, + "rewards/reward_model/mean": -0.2941930890083313, + "rewards/reward_model/std": 1.3422448635101318, + "step": 37, + "step_time": 168.3831845112145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.33203125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 78.78515625, + "completions/mean_terminated_length": 54.32163619995117, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.8339524874463677, + "epoch": 0.015650741350906095, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8777432441711426, + "kl": 0.014370948238138226, + "learning_rate": 4.567901234567901e-07, + "loss": 0.0431, + "num_tokens": 18363593.0, + "reward": -0.21549299359321594, + "reward_std": 1.0654486417770386, + "rewards/reward_model/mean": -0.21549299359321594, + "rewards/reward_model/std": 1.286303997039795, + "step": 38, + "step_time": 168.6560257449746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3623046875, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 80.3076171875, + "completions/mean_terminated_length": 53.211334228515625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.809513804037124, + "epoch": 0.016062602965403624, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.850346028804779, + "kl": 0.019628787720648688, + "learning_rate": 4.6913580246913586e-07, + "loss": -0.0144, + "num_tokens": 18782015.0, + "reward": -0.19260446727275848, + "reward_std": 1.0799050331115723, + "rewards/reward_model/mean": -0.19260446727275848, + "rewards/reward_model/std": 1.4198755025863647, + "step": 39, + "step_time": 169.06077374424785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3212890625, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 78.080078125, + "completions/mean_terminated_length": 54.44892120361328, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.856378594879061, + "epoch": 0.016474464579901153, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8225836753845215, + "kl": 0.023546985856228275, + "learning_rate": 4.814814814814815e-07, + "loss": 0.0268, + "num_tokens": 19238691.0, + "reward": -0.22110876441001892, + "reward_std": 1.0441968441009521, + "rewards/reward_model/mean": -0.22110876441001892, + "rewards/reward_model/std": 1.3271934986114502, + "step": 40, + "step_time": 169.5594472438097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.33251953125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 77.1416015625, + "completions/mean_terminated_length": 51.805416107177734, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.764324402436614, + "epoch": 0.01688632619439868, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7773052453994751, + "kl": 0.029280464848852716, + "learning_rate": 4.938271604938272e-07, + "loss": 0.0194, + "num_tokens": 19755301.0, + "reward": -0.12512998282909393, + "reward_std": 1.0090844631195068, + "rewards/reward_model/mean": -0.12512998282909393, + "rewards/reward_model/std": 1.2345008850097656, + "step": 41, + "step_time": 170.39410974271595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.27490234375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 72.548828125, + "completions/mean_terminated_length": 51.52592468261719, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.758555585052818, + "epoch": 0.01729818780889621, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8362958431243896, + "kl": 0.03495379232481355, + "learning_rate": 5.061728395061729e-07, + "loss": 0.0014, + "num_tokens": 20199209.0, + "reward": -0.034443896263837814, + "reward_std": 1.0466477870941162, + "rewards/reward_model/mean": -0.034443896263837814, + "rewards/reward_model/std": 1.2755711078643799, + "step": 42, + "step_time": 170.17393092392012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25830078125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 74.35302734375, + "completions/mean_terminated_length": 55.6701774597168, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.6830066749826074, + "epoch": 0.01771004942339374, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9207571148872375, + "kl": 0.03826815243519377, + "learning_rate": 5.185185185185185e-07, + "loss": 0.0197, + "num_tokens": 20667900.0, + "reward": -0.03724297881126404, + "reward_std": 0.9730924367904663, + "rewards/reward_model/mean": -0.03724297881126404, + "rewards/reward_model/std": 1.1648329496383667, + "step": 43, + "step_time": 168.8539799619466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3427734375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 82.59326171875, + "completions/mean_terminated_length": 58.911590576171875, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.722061135340482, + "epoch": 0.018121911037891267, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.691871166229248, + "kl": 0.035297417802212294, + "learning_rate": 5.308641975308642e-07, + "loss": 0.0252, + "num_tokens": 21084443.0, + "reward": 0.1364922821521759, + "reward_std": 0.9992862939834595, + "rewards/reward_model/mean": 0.1364922821521759, + "rewards/reward_model/std": 1.338813066482544, + "step": 44, + "step_time": 168.30778062017635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2216796875, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 70.23193359375, + "completions/mean_terminated_length": 53.77854537963867, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.6960708745755255, + "epoch": 0.018533772652388796, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8769946694374084, + "kl": 0.04808991262325435, + "learning_rate": 5.432098765432099e-07, + "loss": -0.0016, + "num_tokens": 21510710.0, + "reward": 0.1898983120918274, + "reward_std": 0.9757044911384583, + "rewards/reward_model/mean": 0.1898983120918274, + "rewards/reward_model/std": 1.1677379608154297, + "step": 45, + "step_time": 169.03864477854222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 77.9912109375, + "completions/mean_terminated_length": 58.42255401611328, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.6502066934481263, + "epoch": 0.018945634266886325, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6699737906455994, + "kl": 0.04364871226789546, + "learning_rate": 5.555555555555555e-07, + "loss": 0.0298, + "num_tokens": 21968036.0, + "reward": 0.14871619641780853, + "reward_std": 0.8983126878738403, + "rewards/reward_model/mean": 0.14871619641780853, + "rewards/reward_model/std": 1.1425597667694092, + "step": 46, + "step_time": 169.17142802104354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.27294921875, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 73.318359375, + "completions/mean_terminated_length": 52.789791107177734, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.806841244455427, + "epoch": 0.019357495881383854, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7548915147781372, + "kl": 0.050439021695638075, + "learning_rate": 5.679012345679012e-07, + "loss": 0.0261, + "num_tokens": 22428752.0, + "reward": 0.18833398818969727, + "reward_std": 0.9490935802459717, + "rewards/reward_model/mean": 0.18833398818969727, + "rewards/reward_model/std": 1.218595027923584, + "step": 47, + "step_time": 169.8053262718022 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.23583984375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 67.080078125, + "completions/mean_terminated_length": 48.278594970703125, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.7256725700572133, + "epoch": 0.019769357495881382, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8408772349357605, + "kl": 0.05761563615669729, + "learning_rate": 5.80246913580247e-07, + "loss": -0.0234, + "num_tokens": 22883444.0, + "reward": 0.10819900035858154, + "reward_std": 0.9136756062507629, + "rewards/reward_model/mean": 0.10819900035858154, + "rewards/reward_model/std": 1.13023042678833, + "step": 48, + "step_time": 170.28967663506046 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25927734375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 71.54296875, + "completions/mean_terminated_length": 51.78114700317383, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.676765794865787, + "epoch": 0.02018121911037891, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6633880138397217, + "kl": 0.05726497815339826, + "learning_rate": 5.925925925925926e-07, + "loss": 0.0077, + "num_tokens": 23311980.0, + "reward": 0.31039929389953613, + "reward_std": 0.8826955556869507, + "rewards/reward_model/mean": 0.31039929389953613, + "rewards/reward_model/std": 1.1599924564361572, + "step": 49, + "step_time": 169.17084869695827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2392578125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 73.46142578125, + "completions/mean_terminated_length": 56.30873107910156, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.5921450154855847, + "epoch": 0.02059308072487644, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7272253632545471, + "kl": 0.05662718684470747, + "learning_rate": 6.049382716049383e-07, + "loss": -0.0038, + "num_tokens": 23729245.0, + "reward": 0.2335912585258484, + "reward_std": 0.9175702929496765, + "rewards/reward_model/mean": 0.2335912585258484, + "rewards/reward_model/std": 1.1314274072647095, + "step": 50, + "step_time": 169.32695539435372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.26220703125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 77.47705078125, + "completions/mean_terminated_length": 59.521507263183594, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.630979296285659, + "epoch": 0.021004942339373972, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7209051847457886, + "kl": 0.0596031873501488, + "learning_rate": 6.17283950617284e-07, + "loss": -0.019, + "num_tokens": 24171054.0, + "reward": 0.3881710171699524, + "reward_std": 0.9779696464538574, + "rewards/reward_model/mean": 0.3881710171699524, + "rewards/reward_model/std": 1.2501736879348755, + "step": 51, + "step_time": 169.39474018104374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3837890625, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 87.35888671875, + "completions/mean_terminated_length": 62.0467529296875, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "entropy": 1.5277671799995005, + "epoch": 0.0214168039538715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6657931208610535, + "kl": 0.04347534721819102, + "learning_rate": 6.296296296296296e-07, + "loss": 0.0084, + "num_tokens": 24640845.0, + "reward": 0.40088099241256714, + "reward_std": 0.8522671461105347, + "rewards/reward_model/mean": 0.40088099241256714, + "rewards/reward_model/std": 1.1760755777359009, + "step": 52, + "step_time": 169.99416326358914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.26708984375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 76.86572265625, + "completions/mean_terminated_length": 58.23118209838867, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.5355965252965689, + "epoch": 0.02182866556836903, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7246338725090027, + "kl": 0.059198625254794024, + "learning_rate": 6.419753086419753e-07, + "loss": 0.0065, + "num_tokens": 25146234.0, + "reward": 0.32493141293525696, + "reward_std": 0.8951080441474915, + "rewards/reward_model/mean": 0.32493141293525696, + "rewards/reward_model/std": 1.109892725944519, + "step": 53, + "step_time": 170.4134237067774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.26220703125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 77.32373046875, + "completions/mean_terminated_length": 59.313697814941406, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.4214782847557217, + "epoch": 0.022240527182866558, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.663506805896759, + "kl": 0.07469483163731638, + "learning_rate": 6.54320987654321e-07, + "loss": -0.0106, + "num_tokens": 25556273.0, + "reward": 0.5579333305358887, + "reward_std": 0.8257571458816528, + "rewards/reward_model/mean": 0.5579333305358887, + "rewards/reward_model/std": 1.0652962923049927, + "step": 54, + "step_time": 168.6097109238617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2158203125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 67.8369140625, + "completions/mean_terminated_length": 51.278953552246094, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "entropy": 1.4409256265498698, + "epoch": 0.022652388797364087, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6998105645179749, + "kl": 0.09127819760760758, + "learning_rate": 6.666666666666666e-07, + "loss": -0.0145, + "num_tokens": 26009315.0, + "reward": 0.36190831661224365, + "reward_std": 0.8500241637229919, + "rewards/reward_model/mean": 0.36190831661224365, + "rewards/reward_model/std": 1.0737853050231934, + "step": 55, + "step_time": 169.63903413154185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2265625, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 71.69873046875, + "completions/mean_terminated_length": 55.20643997192383, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.4477170635946095, + "epoch": 0.023064250411861616, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6856215596199036, + "kl": 0.10684883118665311, + "learning_rate": 6.790123456790124e-07, + "loss": -0.0154, + "num_tokens": 26453082.0, + "reward": 0.5452687740325928, + "reward_std": 0.7654911875724792, + "rewards/reward_model/mean": 0.5452687740325928, + "rewards/reward_model/std": 0.9965056777000427, + "step": 56, + "step_time": 167.3982848683372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.31689453125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 80.6259765625, + "completions/mean_terminated_length": 58.6490364074707, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "entropy": 1.3876500492915511, + "epoch": 0.023476112026359144, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5802730321884155, + "kl": 0.08226650860888185, + "learning_rate": 6.913580246913581e-07, + "loss": -0.0124, + "num_tokens": 26859036.0, + "reward": 0.4985049366950989, + "reward_std": 0.8241320252418518, + "rewards/reward_model/mean": 0.4985049366950989, + "rewards/reward_model/std": 1.177066683769226, + "step": 57, + "step_time": 168.99145932588726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.29150390625, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 81.41162109375, + "completions/mean_terminated_length": 62.24327850341797, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.3736186842434108, + "epoch": 0.023887973640856673, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.672706127166748, + "kl": 0.0943700206480571, + "learning_rate": 7.037037037037037e-07, + "loss": 0.0084, + "num_tokens": 27318439.0, + "reward": 0.7748833894729614, + "reward_std": 0.8400471806526184, + "rewards/reward_model/mean": 0.7748833894729614, + "rewards/reward_model/std": 1.0984324216842651, + "step": 58, + "step_time": 168.8087218273431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.30419921875, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 79.111328125, + "completions/mean_terminated_length": 57.737545013427734, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "entropy": 1.3578353270422667, + "epoch": 0.024299835255354202, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6414256691932678, + "kl": 0.10844983752758708, + "learning_rate": 7.160493827160494e-07, + "loss": 0.0049, + "num_tokens": 27845483.0, + "reward": 0.6581840515136719, + "reward_std": 0.7642059326171875, + "rewards/reward_model/mean": 0.6581840515136719, + "rewards/reward_model/std": 1.0230196714401245, + "step": 59, + "step_time": 170.09878712054342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.20947265625, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 68.9013671875, + "completions/mean_terminated_length": 53.24150848388672, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.359937054105103, + "epoch": 0.02471169686985173, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6327216029167175, + "kl": 0.12389619748864789, + "learning_rate": 7.283950617283951e-07, + "loss": -0.0019, + "num_tokens": 28360609.0, + "reward": 0.5726691484451294, + "reward_std": 0.7263065576553345, + "rewards/reward_model/mean": 0.5726691484451294, + "rewards/reward_model/std": 1.0532201528549194, + "step": 60, + "step_time": 169.033332105726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21728515625, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 74.28515625, + "completions/mean_terminated_length": 59.37367248535156, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.2873307822737843, + "epoch": 0.02512355848434926, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6471663117408752, + "kl": 0.11947597128164489, + "learning_rate": 7.407407407407407e-07, + "loss": -0.018, + "num_tokens": 28796649.0, + "reward": 0.8057171106338501, + "reward_std": 0.6930927038192749, + "rewards/reward_model/mean": 0.8057171106338501, + "rewards/reward_model/std": 0.9504708647727966, + "step": 61, + "step_time": 170.3676045727916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.27880859375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 80.5634765625, + "completions/mean_terminated_length": 62.22477722167969, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.4397811936214566, + "epoch": 0.025535420098846788, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5935730934143066, + "kl": 0.11211827301303856, + "learning_rate": 7.530864197530865e-07, + "loss": 0.005, + "num_tokens": 29272715.0, + "reward": 0.5376583337783813, + "reward_std": 0.7316970825195312, + "rewards/reward_model/mean": 0.5376583337783813, + "rewards/reward_model/std": 1.0817116498947144, + "step": 62, + "step_time": 170.07809142861515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.27978515625, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 83.00634765625, + "completions/mean_terminated_length": 65.52745819091797, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "entropy": 1.2591755213215947, + "epoch": 0.025947281713344317, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6332337260246277, + "kl": 0.09935169246455189, + "learning_rate": 7.654320987654321e-07, + "loss": -0.0047, + "num_tokens": 29675416.0, + "reward": 0.8634133338928223, + "reward_std": 0.7280638217926025, + "rewards/reward_model/mean": 0.8634133338928223, + "rewards/reward_model/std": 1.0552853345870972, + "step": 63, + "step_time": 169.48511258373037 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 79.23291015625, + "completions/mean_terminated_length": 64.30420684814453, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.3228048181626946, + "epoch": 0.026359143327841845, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5896514058113098, + "kl": 0.10629570209130179, + "learning_rate": 7.777777777777778e-07, + "loss": 0.0018, + "num_tokens": 30103349.0, + "reward": 0.7413797378540039, + "reward_std": 0.6787456274032593, + "rewards/reward_model/mean": 0.7413797378540039, + "rewards/reward_model/std": 0.9844362735748291, + "step": 64, + "step_time": 168.49784950073808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2080078125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 77.34521484375, + "completions/mean_terminated_length": 64.04130554199219, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "entropy": 1.3859816826879978, + "epoch": 0.026771004942339374, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.569113552570343, + "kl": 0.1355650291661732, + "learning_rate": 7.901234567901236e-07, + "loss": 0.0008, + "num_tokens": 30598360.0, + "reward": 0.8138879537582397, + "reward_std": 0.6921124458312988, + "rewards/reward_model/mean": 0.8138879537582397, + "rewards/reward_model/std": 1.008180856704712, + "step": 65, + "step_time": 168.94378049625084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2333984375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 76.275390625, + "completions/mean_terminated_length": 60.52738952636719, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.276806804118678, + "epoch": 0.027182866556836903, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.584193229675293, + "kl": 0.12889757197990548, + "learning_rate": 8.024691358024692e-07, + "loss": 0.0109, + "num_tokens": 31028524.0, + "reward": 0.7695643901824951, + "reward_std": 0.7420451641082764, + "rewards/reward_model/mean": 0.7695643901824951, + "rewards/reward_model/std": 1.1982769966125488, + "step": 66, + "step_time": 169.0462037078105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2314453125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 81.75390625, + "completions/mean_terminated_length": 67.82718658447266, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "entropy": 1.2658436398487538, + "epoch": 0.02759472817133443, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5793888568878174, + "kl": 0.10512553945591208, + "learning_rate": 8.148148148148147e-07, + "loss": 0.019, + "num_tokens": 31459860.0, + "reward": 0.9362199306488037, + "reward_std": 0.6280190944671631, + "rewards/reward_model/mean": 0.9362199306488037, + "rewards/reward_model/std": 1.003322958946228, + "step": 67, + "step_time": 168.24778978247195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.20068359375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 75.294921875, + "completions/mean_terminated_length": 62.06230926513672, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.3218755372799933, + "epoch": 0.02800658978583196, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.619716465473175, + "kl": 0.13950362912146375, + "learning_rate": 8.271604938271605e-07, + "loss": 0.0032, + "num_tokens": 31900336.0, + "reward": 0.7856715321540833, + "reward_std": 0.6523309946060181, + "rewards/reward_model/mean": 0.7856715321540833, + "rewards/reward_model/std": 0.9243690371513367, + "step": 68, + "step_time": 168.62237379932776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 72.62060546875, + "completions/mean_terminated_length": 59.84074783325195, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.2817874399479479, + "epoch": 0.02841845140032949, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6988398432731628, + "kl": 0.1405531533237081, + "learning_rate": 8.395061728395062e-07, + "loss": 0.0001, + "num_tokens": 32349991.0, + "reward": 0.7539228200912476, + "reward_std": 0.6927404403686523, + "rewards/reward_model/mean": 0.7539228200912476, + "rewards/reward_model/std": 1.1138005256652832, + "step": 69, + "step_time": 168.95463426411152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.18310546875, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 76.68505859375, + "completions/mean_terminated_length": 65.18290710449219, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 1.3052547052502632, + "epoch": 0.028830313014827018, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5859233140945435, + "kl": 0.14809596522536594, + "learning_rate": 8.518518518518518e-07, + "loss": 0.006, + "num_tokens": 32799778.0, + "reward": 0.899767279624939, + "reward_std": 0.6600509881973267, + "rewards/reward_model/mean": 0.899767279624939, + "rewards/reward_model/std": 1.0378800630569458, + "step": 70, + "step_time": 168.27934673754498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21435546875, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 77.1396484375, + "completions/mean_terminated_length": 63.26289749145508, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "entropy": 1.2957828119397163, + "epoch": 0.029242174629324547, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6063217520713806, + "kl": 0.1499464159278432, + "learning_rate": 8.641975308641976e-07, + "loss": 0.0009, + "num_tokens": 33258528.0, + "reward": 0.9532963037490845, + "reward_std": 0.5860557556152344, + "rewards/reward_model/mean": 0.9532963037490845, + "rewards/reward_model/std": 0.9753101468086243, + "step": 71, + "step_time": 168.12876597139984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.123046875, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 71.07666015625, + "completions/mean_terminated_length": 63.08964538574219, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.2831250824965537, + "epoch": 0.029654036243822075, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.650992214679718, + "kl": 0.14023015335260425, + "learning_rate": 8.765432098765433e-07, + "loss": 0.0053, + "num_tokens": 33708125.0, + "reward": 0.8464133739471436, + "reward_std": 0.5981078147888184, + "rewards/reward_model/mean": 0.8464133739471436, + "rewards/reward_model/std": 0.9848034977912903, + "step": 72, + "step_time": 168.81821045372635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1435546875, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 70.923828125, + "completions/mean_terminated_length": 61.356895446777344, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.27306635864079, + "epoch": 0.030065897858319604, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6549474000930786, + "kl": 0.15989831037586555, + "learning_rate": 8.888888888888889e-07, + "loss": -0.0137, + "num_tokens": 34163265.0, + "reward": 0.9670735001564026, + "reward_std": 0.590969979763031, + "rewards/reward_model/mean": 0.9670735001564026, + "rewards/reward_model/std": 0.9453141689300537, + "step": 73, + "step_time": 169.4882780299522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15234375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 76.10107421875, + "completions/mean_terminated_length": 66.77362060546875, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "entropy": 1.2641989213880152, + "epoch": 0.030477759472817133, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5717378854751587, + "kl": 0.13075158167339396, + "learning_rate": 9.012345679012347e-07, + "loss": 0.0049, + "num_tokens": 34635568.0, + "reward": 1.0525561571121216, + "reward_std": 0.5589165687561035, + "rewards/reward_model/mean": 1.0525561571121216, + "rewards/reward_model/std": 0.8849756121635437, + "step": 74, + "step_time": 169.85129849473014 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.18017578125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 76.9130859375, + "completions/mean_terminated_length": 65.6855239868164, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "entropy": 1.2528326134197414, + "epoch": 0.03088962108731466, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5799550414085388, + "kl": 0.13488844226230867, + "learning_rate": 9.135802469135802e-07, + "loss": 0.0233, + "num_tokens": 35042110.0, + "reward": 1.0121339559555054, + "reward_std": 0.6093316078186035, + "rewards/reward_model/mean": 1.0121339559555054, + "rewards/reward_model/std": 0.9795147776603699, + "step": 75, + "step_time": 168.39488552790135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.11328125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 72.35791015625, + "completions/mean_terminated_length": 65.24944305419922, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.2186853648163378, + "epoch": 0.03130148270181219, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6350732445716858, + "kl": 0.13198584888596088, + "learning_rate": 9.259259259259259e-07, + "loss": 0.0245, + "num_tokens": 35454619.0, + "reward": 1.1278910636901855, + "reward_std": 0.6185814738273621, + "rewards/reward_model/mean": 1.1278910636901855, + "rewards/reward_model/std": 0.9232901930809021, + "step": 76, + "step_time": 169.25788368703797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.13134765625, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 70.81689453125, + "completions/mean_terminated_length": 62.170318603515625, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "entropy": 1.247056140564382, + "epoch": 0.03171334431630972, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.759410560131073, + "kl": 0.16601154587988276, + "learning_rate": 9.382716049382717e-07, + "loss": -0.0003, + "num_tokens": 35845316.0, + "reward": 1.0192276239395142, + "reward_std": 0.5931369066238403, + "rewards/reward_model/mean": 1.0192276239395142, + "rewards/reward_model/std": 0.9772949814796448, + "step": 77, + "step_time": 168.0306376479566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09521484375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 70.13916015625, + "completions/mean_terminated_length": 64.05018615722656, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.2540333659853786, + "epoch": 0.03212520593080725, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6740376353263855, + "kl": 0.1378997444990091, + "learning_rate": 9.506172839506173e-07, + "loss": -0.013, + "num_tokens": 36287137.0, + "reward": 0.9819941520690918, + "reward_std": 0.604373574256897, + "rewards/reward_model/mean": 0.9819941520690918, + "rewards/reward_model/std": 0.9436709880828857, + "step": 78, + "step_time": 168.8950103893876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09130859375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 68.37744140625, + "completions/mean_terminated_length": 62.3863525390625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.1720305329654366, + "epoch": 0.032537067545304776, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6466286182403564, + "kl": 0.12133494461886585, + "learning_rate": 9.62962962962963e-07, + "loss": -0.0048, + "num_tokens": 36719654.0, + "reward": 1.1645737886428833, + "reward_std": 0.5557790398597717, + "rewards/reward_model/mean": 1.1645737886428833, + "rewards/reward_model/std": 0.9391114711761475, + "step": 79, + "step_time": 169.75694013293833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.13330078125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 69.42236328125, + "completions/mean_terminated_length": 60.412960052490234, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "entropy": 1.2074508473742753, + "epoch": 0.032948929159802305, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7178550362586975, + "kl": 0.16269512700091582, + "learning_rate": 9.753086419753088e-07, + "loss": 0.0024, + "num_tokens": 37198599.0, + "reward": 1.0070809125900269, + "reward_std": 0.6197090148925781, + "rewards/reward_model/mean": 1.0070809125900269, + "rewards/reward_model/std": 0.8695884943008423, + "step": 80, + "step_time": 170.65315298642963 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.10888671875, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 69.23583984375, + "completions/mean_terminated_length": 62.05534362792969, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.1959036465268582, + "epoch": 0.033360790774299834, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6767128705978394, + "kl": 0.1640322696766816, + "learning_rate": 9.876543209876544e-07, + "loss": -0.0102, + "num_tokens": 37627498.0, + "reward": 1.051206111907959, + "reward_std": 0.6312122344970703, + "rewards/reward_model/mean": 1.051206111907959, + "rewards/reward_model/std": 1.006866455078125, + "step": 81, + "step_time": 169.01685216045007 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15234375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 76.1484375, + "completions/mean_terminated_length": 66.8294906616211, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "entropy": 1.2353556689340621, + "epoch": 0.03377265238879736, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6121835708618164, + "kl": 0.13194930272584315, + "learning_rate": 1e-06, + "loss": -0.0084, + "num_tokens": 38077466.0, + "reward": 1.1334960460662842, + "reward_std": 0.5429809093475342, + "rewards/reward_model/mean": 1.1334960460662842, + "rewards/reward_model/std": 1.0062233209609985, + "step": 82, + "step_time": 169.77917499747127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1455078125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 74.54931640625, + "completions/mean_terminated_length": 65.44742584228516, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.2487247881945223, + "epoch": 0.03418451400329489, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8223838210105896, + "kl": 0.1559760042873677, + "learning_rate": 1.0123456790123457e-06, + "loss": 0.0192, + "num_tokens": 38560095.0, + "reward": 1.0274322032928467, + "reward_std": 0.6149877309799194, + "rewards/reward_model/mean": 1.0274322032928467, + "rewards/reward_model/std": 0.8848612308502197, + "step": 83, + "step_time": 168.7728981245309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0693359375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 68.9453125, + "completions/mean_terminated_length": 64.54563903808594, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.203527741599828, + "epoch": 0.03459637561779242, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6430616974830627, + "kl": 0.16221579984994605, + "learning_rate": 1.0246913580246913e-06, + "loss": -0.0054, + "num_tokens": 38989743.0, + "reward": 1.1352043151855469, + "reward_std": 0.5808489918708801, + "rewards/reward_model/mean": 1.1352043151855469, + "rewards/reward_model/std": 1.0034772157669067, + "step": 84, + "step_time": 167.36356884567067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.08544921875, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 73.13720703125, + "completions/mean_terminated_length": 68.01121520996094, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.1932638001162559, + "epoch": 0.03500823723228995, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.618569552898407, + "kl": 0.1509321930789156, + "learning_rate": 1.037037037037037e-06, + "loss": -0.0105, + "num_tokens": 39413960.0, + "reward": 1.1917307376861572, + "reward_std": 0.6115972995758057, + "rewards/reward_model/mean": 1.1917307376861572, + "rewards/reward_model/std": 0.8577749729156494, + "step": 85, + "step_time": 169.4022615076974 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1328125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 74.16455078125, + "completions/mean_terminated_length": 65.91948699951172, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.1511160423979163, + "epoch": 0.03542009884678748, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6170347929000854, + "kl": 0.16613518859958276, + "learning_rate": 1.0493827160493827e-06, + "loss": 0.0068, + "num_tokens": 39897081.0, + "reward": 1.2492460012435913, + "reward_std": 0.6051790714263916, + "rewards/reward_model/mean": 1.2492460012435913, + "rewards/reward_model/std": 0.8991779685020447, + "step": 86, + "step_time": 168.7876625736244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.091796875, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 73.81884765625, + "completions/mean_terminated_length": 68.34247589111328, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 1.216100089251995, + "epoch": 0.035831960461285006, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6116214394569397, + "kl": 0.17693783454888035, + "learning_rate": 1.0617283950617285e-06, + "loss": 0.014, + "num_tokens": 40304870.0, + "reward": 1.327715277671814, + "reward_std": 0.5273313522338867, + "rewards/reward_model/mean": 1.327715277671814, + "rewards/reward_model/std": 0.8829416036605835, + "step": 87, + "step_time": 170.00215818034485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.10498046875, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 82.087890625, + "completions/mean_terminated_length": 76.70267486572266, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.2015210629906505, + "epoch": 0.036243822075782535, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5836432576179504, + "kl": 0.14079495580517687, + "learning_rate": 1.074074074074074e-06, + "loss": 0.0043, + "num_tokens": 40738362.0, + "reward": 1.2116522789001465, + "reward_std": 0.5785905122756958, + "rewards/reward_model/mean": 1.2116522789001465, + "rewards/reward_model/std": 0.8599736094474792, + "step": 88, + "step_time": 170.04560359567404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.07080078125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 71.1552734375, + "completions/mean_terminated_length": 66.82395935058594, + "completions/min_length": 1.0, + "completions/min_terminated_length": 1.0, + "entropy": 1.1739569688215852, + "epoch": 0.036655683690280064, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5943244099617004, + "kl": 0.16060514625860378, + "learning_rate": 1.0864197530864199e-06, + "loss": -0.0115, + "num_tokens": 41174584.0, + "reward": 1.079056739807129, + "reward_std": 0.5492511987686157, + "rewards/reward_model/mean": 1.079056739807129, + "rewards/reward_model/std": 0.8875714540481567, + "step": 89, + "step_time": 170.3232544688508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.08203125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 73.50341796875, + "completions/mean_terminated_length": 68.63350677490234, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.2288584825582802, + "epoch": 0.03706754530477759, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6955690979957581, + "kl": 0.18743510471540503, + "learning_rate": 1.0987654320987655e-06, + "loss": 0.0392, + "num_tokens": 41634879.0, + "reward": 1.2760483026504517, + "reward_std": 0.5275530219078064, + "rewards/reward_model/mean": 1.2760483026504517, + "rewards/reward_model/std": 0.8526185154914856, + "step": 90, + "step_time": 169.28955688048154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.103515625, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 77.4619140625, + "completions/mean_terminated_length": 71.62635803222656, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "entropy": 1.1926879836246371, + "epoch": 0.03747940691927512, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6305184364318848, + "kl": 0.16981972678331658, + "learning_rate": 1.111111111111111e-06, + "loss": -0.0032, + "num_tokens": 42093777.0, + "reward": 1.3892216682434082, + "reward_std": 0.5210399627685547, + "rewards/reward_model/mean": 1.3892216682434082, + "rewards/reward_model/std": 0.8532023429870605, + "step": 91, + "step_time": 169.93193591805175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1005859375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 80.02099609375, + "completions/mean_terminated_length": 74.65526580810547, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "entropy": 1.220891160191968, + "epoch": 0.03789126853377265, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5791942477226257, + "kl": 0.13990605663275346, + "learning_rate": 1.1234567901234568e-06, + "loss": 0.021, + "num_tokens": 42533916.0, + "reward": 1.3777389526367188, + "reward_std": 0.5628249049186707, + "rewards/reward_model/mean": 1.3777389526367188, + "rewards/reward_model/std": 0.8695874214172363, + "step": 92, + "step_time": 168.67672005156055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.13427734375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 81.66845703125, + "completions/mean_terminated_length": 74.48223876953125, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "entropy": 1.1679968070238829, + "epoch": 0.03830313014827018, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.577627956867218, + "kl": 0.14946075380430557, + "learning_rate": 1.1358024691358024e-06, + "loss": 0.0005, + "num_tokens": 42977269.0, + "reward": 1.2400810718536377, + "reward_std": 0.5488580465316772, + "rewards/reward_model/mean": 1.2400810718536377, + "rewards/reward_model/std": 0.8818415999412537, + "step": 93, + "step_time": 169.11300712404773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1591796875, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 89.02783203125, + "completions/mean_terminated_length": 81.64982604980469, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "entropy": 1.2256716336123645, + "epoch": 0.03871499176276771, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3020151853561401, + "kl": 0.1575723221176304, + "learning_rate": 1.1481481481481482e-06, + "loss": 0.0078, + "num_tokens": 43468782.0, + "reward": 1.2878694534301758, + "reward_std": 0.5034958124160767, + "rewards/reward_model/mean": 1.2878694534301758, + "rewards/reward_model/std": 0.8000524640083313, + "step": 94, + "step_time": 168.66197129152715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.146484375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 82.29296875, + "completions/mean_terminated_length": 74.4485092163086, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "entropy": 1.2125889593735337, + "epoch": 0.039126853377265236, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5775403380393982, + "kl": 0.17348909995052963, + "learning_rate": 1.160493827160494e-06, + "loss": 0.0291, + "num_tokens": 43925606.0, + "reward": 1.3064830303192139, + "reward_std": 0.5317621231079102, + "rewards/reward_model/mean": 1.3064830303192139, + "rewards/reward_model/std": 0.8767746090888977, + "step": 95, + "step_time": 168.66238435404375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.11376953125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 83.8466796875, + "completions/mean_terminated_length": 78.17851257324219, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "entropy": 1.2149158080574125, + "epoch": 0.039538714991762765, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5917447209358215, + "kl": 0.16629101379658096, + "learning_rate": 1.1728395061728396e-06, + "loss": 0.0372, + "num_tokens": 44365228.0, + "reward": 1.310151219367981, + "reward_std": 0.5394536852836609, + "rewards/reward_model/mean": 1.310151219367981, + "rewards/reward_model/std": 0.8472654223442078, + "step": 96, + "step_time": 170.65063601452857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.087890625, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 81.22119140625, + "completions/mean_terminated_length": 76.7136001586914, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "entropy": 1.2331861625425518, + "epoch": 0.039950576606260293, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6160045266151428, + "kl": 0.18450818251585588, + "learning_rate": 1.1851851851851852e-06, + "loss": 0.0239, + "num_tokens": 44809841.0, + "reward": 1.388469934463501, + "reward_std": 0.47841960191726685, + "rewards/reward_model/mean": 1.388469934463501, + "rewards/reward_model/std": 0.7604539394378662, + "step": 97, + "step_time": 168.5139070255682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.14404296875, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 86.03857421875, + "completions/mean_terminated_length": 78.97718811035156, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "entropy": 1.2734931902959943, + "epoch": 0.04036243822075782, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.8496507406234741, + "kl": 0.1486935554712545, + "learning_rate": 1.197530864197531e-06, + "loss": 0.024, + "num_tokens": 45291392.0, + "reward": 1.273500680923462, + "reward_std": 0.5131819844245911, + "rewards/reward_model/mean": 1.273500680923462, + "rewards/reward_model/std": 0.8183842301368713, + "step": 98, + "step_time": 169.52931605745107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.07861328125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 76.30078125, + "completions/mean_terminated_length": 71.8897705078125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 1.210193380014971, + "epoch": 0.04077429983525535, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6533998847007751, + "kl": 0.16968276555417106, + "learning_rate": 1.2098765432098765e-06, + "loss": 0.0227, + "num_tokens": 45769608.0, + "reward": 1.374595046043396, + "reward_std": 0.5025352835655212, + "rewards/reward_model/mean": 1.374595046043396, + "rewards/reward_model/std": 0.7355093359947205, + "step": 99, + "step_time": 170.76728575211018 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.10595703125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 80.27490234375, + "completions/mean_terminated_length": 74.61878204345703, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "entropy": 1.1873215795494616, + "epoch": 0.04118616144975288, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5959328413009644, + "kl": 0.14610896681551822, + "learning_rate": 1.2222222222222221e-06, + "loss": 0.0218, + "num_tokens": 46193339.0, + "reward": 1.393322229385376, + "reward_std": 0.5245035886764526, + "rewards/reward_model/mean": 1.393322229385376, + "rewards/reward_model/std": 0.8767962455749512, + "step": 100, + "step_time": 169.11898464756086 + } + ], + "logging_steps": 1, + "max_steps": 2428, + "num_input_tokens_seen": 46193339, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}