grpo_hh_checkpoint_100 / trainer_state.json

Upload folder using huggingface_hub

00d8bcb verified about 1 month ago

105 kB

	{
	"best_global_step": null,
	"best_metric": null,
	"best_model_checkpoint": null,
	"epoch": 0.04118616144975288,
	"eval_steps": 1000,
	"global_step": 100,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.4658203125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 88.3837890625,
	"completions/mean_terminated_length": 53.83729553222656,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 2.0787938493303955,
	"epoch": 0.00041186161449752884,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 1.1026352643966675,
	"kl": 1.5408068257727336e-05,
	"learning_rate": 0.0,
	"loss": 0.0612,
	"num_tokens": 473618.0,
	"reward": -0.654300332069397,
	"reward_std": 1.2014957666397095,
	"rewards/reward_model/mean": -0.654300332069397,
	"rewards/reward_model/std": 1.4879947900772095,
	"step": 1,
	"step_time": 179.40438475832343
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.521484375,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 95.0380859375,
	"completions/mean_terminated_length": 59.11632537841797,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 2.0889650019817054,
	"epoch": 0.0008237232289950577,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.9445520043373108,
	"kl": 1.5487904489575044e-05,
	"learning_rate": 1.2345679012345681e-08,
	"loss": 0.0685,
	"num_tokens": 944384.0,
	"reward": -0.6944406032562256,
	"reward_std": 1.1158981323242188,
	"rewards/reward_model/mean": -0.6944406032562256,
	"rewards/reward_model/std": 1.4779117107391357,
	"step": 2,
	"step_time": 168.28568758117035
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.4921875,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 93.1064453125,
	"completions/mean_terminated_length": 59.286537170410156,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 2.1078118681907654,
	"epoch": 0.0012355848434925864,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.9809994101524353,
	"kl": 0.0009136445219155576,
	"learning_rate": 2.4691358024691362e-08,
	"loss": 0.061,
	"num_tokens": 1417434.0,
	"reward": -0.8067716956138611,
	"reward_std": 1.1805193424224854,
	"rewards/reward_model/mean": -0.8067716956138611,
	"rewards/reward_model/std": 1.5296157598495483,
	"step": 3,
	"step_time": 168.7894278760068
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.4912109375,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 90.3203125,
	"completions/mean_terminated_length": 53.94241714477539,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 2.0778324585407972,
	"epoch": 0.0016474464579901153,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.9622601270675659,
	"kl": 0.0009502729969881329,
	"learning_rate": 3.7037037037037036e-08,
	"loss": 0.0854,
	"num_tokens": 1886250.0,
	"reward": -0.5533753037452698,
	"reward_std": 1.0693888664245605,
	"rewards/reward_model/mean": -0.5533753037452698,
	"rewards/reward_model/std": 1.3799840211868286,
	"step": 4,
	"step_time": 167.94514833204448
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.47802734375,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 88.8896484375,
	"completions/mean_terminated_length": 53.07202911376953,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 2.164030898362398,
	"epoch": 0.002059308072487644,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 1.0810271501541138,
	"kl": 0.0010240612220968615,
	"learning_rate": 4.9382716049382724e-08,
	"loss": 0.0979,
	"num_tokens": 2372616.0,
	"reward": -0.8290466070175171,
	"reward_std": 1.1383775472640991,
	"rewards/reward_model/mean": -0.8290466070175171,
	"rewards/reward_model/std": 1.4821057319641113,
	"step": 5,
	"step_time": 168.5208105482161
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.44775390625,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 86.54541015625,
	"completions/mean_terminated_length": 52.934574127197266,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 2.121390470303595,
	"epoch": 0.002471169686985173,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 1.026131272315979,
	"kl": 0.000956788239591333,
	"learning_rate": 6.17283950617284e-08,
	"loss": 0.102,
	"num_tokens": 2856453.0,
	"reward": -0.5948619842529297,
	"reward_std": 1.0859686136245728,
	"rewards/reward_model/mean": -0.5948619842529297,
	"rewards/reward_model/std": 1.4433753490447998,
	"step": 6,
	"step_time": 169.30755526619032
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.47998046875,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 88.40185546875,
	"completions/mean_terminated_length": 51.85258483886719,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 2.1293580746278167,
	"epoch": 0.002883031301482702,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.9581882953643799,
	"kl": 0.0010509827170608332,
	"learning_rate": 7.407407407407407e-08,
	"loss": 0.0752,
	"num_tokens": 3349660.0,
	"reward": -0.8746315836906433,
	"reward_std": 1.1371493339538574,
	"rewards/reward_model/mean": -0.8746315836906433,
	"rewards/reward_model/std": 1.5432283878326416,
	"step": 7,
	"step_time": 170.4541406123899
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.462890625,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 89.59423828125,
	"completions/mean_terminated_length": 56.49545669555664,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 2.199626039713621,
	"epoch": 0.0032948929159802307,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 1.1233900785446167,
	"kl": 0.0011659049921490805,
	"learning_rate": 8.641975308641976e-08,
	"loss": 0.0835,
	"num_tokens": 3869181.0,
	"reward": -0.9943232536315918,
	"reward_std": 1.099515438079834,
	"rewards/reward_model/mean": -0.9943232536315918,
	"rewards/reward_model/std": 1.4042030572891235,
	"step": 8,
	"step_time": 168.8292339304462
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.50244140625,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 92.26611328125,
	"completions/mean_terminated_length": 56.181549072265625,
	"completions/min_length": 3.0,
	"completions/min_terminated_length": 3.0,
	"entropy": 2.103476638905704,
	"epoch": 0.0037067545304777594,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 1.0071220397949219,
	"kl": 0.0010218678287401417,
	"learning_rate": 9.876543209876545e-08,
	"loss": 0.0786,
	"num_tokens": 4330526.0,
	"reward": -0.7287623286247253,
	"reward_std": 1.2205724716186523,
	"rewards/reward_model/mean": -0.7287623286247253,
	"rewards/reward_model/std": 1.5410621166229248,
	"step": 9,
	"step_time": 168.4490856071934
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.4384765625,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 87.068359375,
	"completions/mean_terminated_length": 55.10608673095703,
	"completions/min_length": 3.0,
	"completions/min_terminated_length": 3.0,
	"entropy": 2.068316952791065,
	"epoch": 0.004118616144975288,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 1.0786375999450684,
	"kl": 0.0011272716699295415,
	"learning_rate": 1.1111111111111111e-07,
	"loss": 0.0808,
	"num_tokens": 4813482.0,
	"reward": -0.8588310480117798,
	"reward_std": 1.1204930543899536,
	"rewards/reward_model/mean": -0.8588310480117798,
	"rewards/reward_model/std": 1.4020955562591553,
	"step": 10,
	"step_time": 169.3498973324895
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.486328125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 88.27197265625,
	"completions/mean_terminated_length": 50.65874481201172,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 2.085946503095329,
	"epoch": 0.004530477759472817,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 1.1249303817749023,
	"kl": 0.0010357791773003555,
	"learning_rate": 1.234567901234568e-07,
	"loss": 0.103,
	"num_tokens": 5276279.0,
	"reward": -0.7370425462722778,
	"reward_std": 1.1393404006958008,
	"rewards/reward_model/mean": -0.7370425462722778,
	"rewards/reward_model/std": 1.435203194618225,
	"step": 11,
	"step_time": 169.61693120608106
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.435546875,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 84.62060546875,
	"completions/mean_terminated_length": 51.147926330566406,
	"completions/min_length": 1.0,
	"completions/min_terminated_length": 1.0,
	"entropy": 2.0578739237971604,
	"epoch": 0.004942339373970346,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 1.1678587198257446,
	"kl": 0.0010205526389199804,
	"learning_rate": 1.3580246913580248e-07,
	"loss": 0.129,
	"num_tokens": 5750318.0,
	"reward": -0.6621623039245605,
	"reward_std": 1.1341545581817627,
	"rewards/reward_model/mean": -0.6621623039245605,
	"rewards/reward_model/std": 1.4956636428833008,
	"step": 12,
	"step_time": 170.6942683076486
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.48876953125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 89.955078125,
	"completions/mean_terminated_length": 53.581661224365234,
	"completions/min_length": 3.0,
	"completions/min_terminated_length": 3.0,
	"entropy": 2.1759862853214145,
	"epoch": 0.005354200988467875,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.9353419542312622,
	"kl": 0.0009984489861381007,
	"learning_rate": 1.4814814814814815e-07,
	"loss": 0.0706,
	"num_tokens": 6237106.0,
	"reward": -0.71650230884552,
	"reward_std": 1.1081366539001465,
	"rewards/reward_model/mean": -0.71650230884552,
	"rewards/reward_model/std": 1.4882901906967163,
	"step": 13,
	"step_time": 168.60461562033743
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.47265625,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 127.0,
	"completions/mean_length": 90.6298828125,
	"completions/mean_terminated_length": 57.13518524169922,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 2.1225685542449355,
	"epoch": 0.005766062602965404,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.9926177263259888,
	"kl": 0.0010822901392657513,
	"learning_rate": 1.6049382716049383e-07,
	"loss": 0.0705,
	"num_tokens": 6768988.0,
	"reward": -0.8033103346824646,
	"reward_std": 1.1658474206924438,
	"rewards/reward_model/mean": -0.8033103346824646,
	"rewards/reward_model/std": 1.5343424081802368,
	"step": 14,
	"step_time": 169.76986178942025
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.43359375,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 87.02978515625,
	"completions/mean_terminated_length": 55.666378021240234,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 2.2208039346151054,
	"epoch": 0.006177924217462933,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 1.110655426979065,
	"kl": 0.00102766218378747,
	"learning_rate": 1.7283950617283952e-07,
	"loss": 0.1137,
	"num_tokens": 7264761.0,
	"reward": -0.8211149573326111,
	"reward_std": 1.1067304611206055,
	"rewards/reward_model/mean": -0.8211149573326111,
	"rewards/reward_model/std": 1.4263983964920044,
	"step": 15,
	"step_time": 169.37415388552472
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.447265625,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 89.8525390625,
	"completions/mean_terminated_length": 58.984100341796875,
	"completions/min_length": 1.0,
	"completions/min_terminated_length": 1.0,
	"entropy": 2.036483039613813,
	"epoch": 0.006589785831960461,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 1.0956333875656128,
	"kl": 0.0011385848249574337,
	"learning_rate": 1.8518518518518518e-07,
	"loss": 0.0679,
	"num_tokens": 7745675.0,
	"reward": -0.5313577651977539,
	"reward_std": 1.1804759502410889,
	"rewards/reward_model/mean": -0.5313577651977539,
	"rewards/reward_model/std": 1.5051146745681763,
	"step": 16,
	"step_time": 168.84541190741584
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.46142578125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 87.02197265625,
	"completions/mean_terminated_length": 51.91387176513672,
	"completions/min_length": 3.0,
	"completions/min_terminated_length": 3.0,
	"entropy": 2.1997494087554514,
	"epoch": 0.00700164744645799,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 1.0221517086029053,
	"kl": 0.0010186115512169636,
	"learning_rate": 1.975308641975309e-07,
	"loss": 0.0962,
	"num_tokens": 8243704.0,
	"reward": -0.89983731508255,
	"reward_std": 1.135831356048584,
	"rewards/reward_model/mean": -0.89983731508255,
	"rewards/reward_model/std": 1.4320958852767944,
	"step": 17,
	"step_time": 168.78324813907966
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.45947265625,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 88.05908203125,
	"completions/mean_terminated_length": 54.10749816894531,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 2.176318216137588,
	"epoch": 0.007413509060955519,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 1.0268303155899048,
	"kl": 0.0010050315393073106,
	"learning_rate": 2.0987654320987656e-07,
	"loss": 0.0845,
	"num_tokens": 8726801.0,
	"reward": -0.7434755563735962,
	"reward_std": 1.1786913871765137,
	"rewards/reward_model/mean": -0.7434755563735962,
	"rewards/reward_model/std": 1.4701310396194458,
	"step": 18,
	"step_time": 168.51915573468432
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.4658203125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 88.00439453125,
	"completions/mean_terminated_length": 53.12705993652344,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 2.0752989565953612,
	"epoch": 0.007825370675453048,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 1.0904563665390015,
	"kl": 0.00123250130627639,
	"learning_rate": 2.2222222222222222e-07,
	"loss": 0.0885,
	"num_tokens": 9180858.0,
	"reward": -0.8568893074989319,
	"reward_std": 1.1963412761688232,
	"rewards/reward_model/mean": -0.8568893074989319,
	"rewards/reward_model/std": 1.5186042785644531,
	"step": 19,
	"step_time": 170.01141701499
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.4033203125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 83.22021484375,
	"completions/mean_terminated_length": 52.95172119140625,
	"completions/min_length": 3.0,
	"completions/min_terminated_length": 3.0,
	"entropy": 2.200795284938067,
	"epoch": 0.008237232289950576,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 1.0073621273040771,
	"kl": 0.0012237756848207937,
	"learning_rate": 2.3456790123456793e-07,
	"loss": 0.0782,
	"num_tokens": 9702557.0,
	"reward": -0.9474191069602966,
	"reward_std": 1.101952314376831,
	"rewards/reward_model/mean": -0.9474191665649414,
	"rewards/reward_model/std": 1.514784336090088,
	"step": 20,
	"step_time": 168.7904914407991
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.44189453125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 127.0,
	"completions/mean_length": 85.07275390625,
	"completions/mean_terminated_length": 51.08399200439453,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 2.1343746068887413,
	"epoch": 0.008649093904448105,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 1.1044775247573853,
	"kl": 0.0011666002192214364,
	"learning_rate": 2.469135802469136e-07,
	"loss": 0.0659,
	"num_tokens": 10182002.0,
	"reward": -0.8981258869171143,
	"reward_std": 1.1897304058074951,
	"rewards/reward_model/mean": -0.8981258869171143,
	"rewards/reward_model/std": 1.4881244897842407,
	"step": 21,
	"step_time": 168.61277754418552
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.4580078125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 88.984375,
	"completions/mean_terminated_length": 56.0144157409668,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 2.060287212021649,
	"epoch": 0.009060955518945634,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.9674479961395264,
	"kl": 0.0012853052542141086,
	"learning_rate": 2.5925925925925923e-07,
	"loss": 0.0795,
	"num_tokens": 10599858.0,
	"reward": -0.7459607720375061,
	"reward_std": 1.18560791015625,
	"rewards/reward_model/mean": -0.7459607720375061,
	"rewards/reward_model/std": 1.4447804689407349,
	"step": 22,
	"step_time": 168.57235636515543
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.45751953125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 87.20947265625,
	"completions/mean_terminated_length": 52.80738067626953,
	"completions/min_length": 3.0,
	"completions/min_terminated_length": 3.0,
	"entropy": 1.9992840560153127,
	"epoch": 0.009472817133443162,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 1.2746193408966064,
	"kl": 0.001352380840899059,
	"learning_rate": 2.7160493827160497e-07,
	"loss": 0.0805,
	"num_tokens": 11135295.0,
	"reward": -0.9941644668579102,
	"reward_std": 1.2033442258834839,
	"rewards/reward_model/mean": -0.9941644668579102,
	"rewards/reward_model/std": 1.5118839740753174,
	"step": 23,
	"step_time": 168.93097670795396
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.458984375,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 88.12744140625,
	"completions/mean_terminated_length": 54.300540924072266,
	"completions/min_length": 3.0,
	"completions/min_terminated_length": 3.0,
	"entropy": 2.1036753226071596,
	"epoch": 0.009884678747940691,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.9719477295875549,
	"kl": 0.0014479562626092957,
	"learning_rate": 2.839506172839506e-07,
	"loss": 0.0792,
	"num_tokens": 11647428.0,
	"reward": -0.7246302366256714,
	"reward_std": 1.1223700046539307,
	"rewards/reward_model/mean": -0.7246302366256714,
	"rewards/reward_model/std": 1.4486252069473267,
	"step": 24,
	"step_time": 168.1220847275108
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.48876953125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 92.43017578125,
	"completions/mean_terminated_length": 58.42311477661133,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 2.0290252747945487,
	"epoch": 0.01029654036243822,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 1.036043643951416,
	"kl": 0.0014065650616430503,
	"learning_rate": 2.962962962962963e-07,
	"loss": 0.0464,
	"num_tokens": 12175925.0,
	"reward": -0.8139803409576416,
	"reward_std": 1.18918776512146,
	"rewards/reward_model/mean": -0.8139803409576416,
	"rewards/reward_model/std": 1.5184983015060425,
	"step": 25,
	"step_time": 169.08092289417982
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.48193359375,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 91.38720703125,
	"completions/mean_terminated_length": 57.3279914855957,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 2.0865674833767116,
	"epoch": 0.01070840197693575,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.9221011996269226,
	"kl": 0.0016496309149260924,
	"learning_rate": 3.08641975308642e-07,
	"loss": 0.0459,
	"num_tokens": 12671022.0,
	"reward": -0.6815944910049438,
	"reward_std": 1.1987043619155884,
	"rewards/reward_model/mean": -0.6815944910049438,
	"rewards/reward_model/std": 1.503211259841919,
	"step": 26,
	"step_time": 169.66068721655756
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.39208984375,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 82.84814453125,
	"completions/mean_terminated_length": 53.726104736328125,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 2.100129804573953,
	"epoch": 0.011120263591433279,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 1.0829551219940186,
	"kl": 0.0020542718434626295,
	"learning_rate": 3.2098765432098767e-07,
	"loss": 0.1111,
	"num_tokens": 13159479.0,
	"reward": -0.7841147780418396,
	"reward_std": 1.1083781719207764,
	"rewards/reward_model/mean": -0.7841147780418396,
	"rewards/reward_model/std": 1.398116946220398,
	"step": 27,
	"step_time": 170.63890342088416
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.41845703125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 84.29150390625,
	"completions/mean_terminated_length": 52.84046936035156,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 2.0639419481158257,
	"epoch": 0.011532125205930808,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.9612188935279846,
	"kl": 0.002871143702122936,
	"learning_rate": 3.333333333333333e-07,
	"loss": 0.0511,
	"num_tokens": 13624972.0,
	"reward": -0.41133514046669006,
	"reward_std": 1.0870225429534912,
	"rewards/reward_model/mean": -0.41133514046669006,
	"rewards/reward_model/std": 1.3928031921386719,
	"step": 28,
	"step_time": 169.01845826301724
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3896484375,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 82.62353515625,
	"completions/mean_terminated_length": 53.65519714355469,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 2.0633218064904213,
	"epoch": 0.011943986820428337,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 1.0220043659210205,
	"kl": 0.004001651409907936,
	"learning_rate": 3.4567901234567904e-07,
	"loss": 0.0608,
	"num_tokens": 14084265.0,
	"reward": -0.5280731916427612,
	"reward_std": 1.139591097831726,
	"rewards/reward_model/mean": -0.5280731916427612,
	"rewards/reward_model/std": 1.5284217596054077,
	"step": 29,
	"step_time": 169.68904952565208
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3447265625,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 77.75439453125,
	"completions/mean_terminated_length": 51.321163177490234,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 2.053064794279635,
	"epoch": 0.012355848434925865,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 1.0159528255462646,
	"kl": 0.004774259470650577,
	"learning_rate": 3.580246913580247e-07,
	"loss": 0.0826,
	"num_tokens": 14545778.0,
	"reward": -0.8308598399162292,
	"reward_std": 1.1439062356948853,
	"rewards/reward_model/mean": -0.8308598399162292,
	"rewards/reward_model/std": 1.4677071571350098,
	"step": 30,
	"step_time": 169.4162016301416
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.40478515625,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 84.13525390625,
	"completions/mean_terminated_length": 54.30434799194336,
	"completions/min_length": 3.0,
	"completions/min_terminated_length": 3.0,
	"entropy": 1.9778149635531008,
	"epoch": 0.012767710049423394,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.9643027186393738,
	"kl": 0.004632160428627685,
	"learning_rate": 3.7037037037037036e-07,
	"loss": 0.0324,
	"num_tokens": 15021479.0,
	"reward": -0.5928993225097656,
	"reward_std": 1.0915915966033936,
	"rewards/reward_model/mean": -0.5928993225097656,
	"rewards/reward_model/std": 1.4171936511993408,
	"step": 31,
	"step_time": 169.55369784962386
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.39404296875,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 81.93603515625,
	"completions/mean_terminated_length": 51.981468200683594,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.9114997563883662,
	"epoch": 0.013179571663920923,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.9297622442245483,
	"kl": 0.005185256256481807,
	"learning_rate": 3.8271604938271605e-07,
	"loss": 0.0641,
	"num_tokens": 15490468.0,
	"reward": -0.4294321537017822,
	"reward_std": 1.1095049381256104,
	"rewards/reward_model/mean": -0.4294321537017822,
	"rewards/reward_model/std": 1.4001518487930298,
	"step": 32,
	"step_time": 169.97963417787105
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.4033203125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 84.73388671875,
	"completions/mean_terminated_length": 55.48854446411133,
	"completions/min_length": 3.0,
	"completions/min_terminated_length": 3.0,
	"entropy": 1.9920633286237717,
	"epoch": 0.013591433278418451,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.9739150404930115,
	"kl": 0.0050489629365984,
	"learning_rate": 3.950617283950618e-07,
	"loss": 0.069,
	"num_tokens": 16033027.0,
	"reward": -0.5853164792060852,
	"reward_std": 1.1397128105163574,
	"rewards/reward_model/mean": -0.5853164792060852,
	"rewards/reward_model/std": 1.4342437982559204,
	"step": 33,
	"step_time": 170.0918092643842
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.33349609375,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 80.16943359375,
	"completions/mean_terminated_length": 56.236629486083984,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 2.052212963812053,
	"epoch": 0.01400329489291598,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 1.0108827352523804,
	"kl": 0.0070345894837373635,
	"learning_rate": 4.0740740740740737e-07,
	"loss": 0.0717,
	"num_tokens": 16443422.0,
	"reward": -0.40320760011672974,
	"reward_std": 1.023691177368164,
	"rewards/reward_model/mean": -0.40320760011672974,
	"rewards/reward_model/std": 1.3064631223678589,
	"step": 34,
	"step_time": 168.22575595136732
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3896484375,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 82.1884765625,
	"completions/mean_terminated_length": 52.94239807128906,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.8943170690909028,
	"epoch": 0.014415156507413509,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.9463284015655518,
	"kl": 0.006503033908302314,
	"learning_rate": 4.197530864197531e-07,
	"loss": 0.0771,
	"num_tokens": 16958848.0,
	"reward": -0.46641844511032104,
	"reward_std": 1.1392958164215088,
	"rewards/reward_model/mean": -0.46641844511032104,
	"rewards/reward_model/std": 1.3904635906219482,
	"step": 35,
	"step_time": 169.57979472074658
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2724609375,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 69.291015625,
	"completions/mean_terminated_length": 47.3046989440918,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 2.0284548006020486,
	"epoch": 0.014827018121911038,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 1.202287197113037,
	"kl": 0.008711300118193321,
	"learning_rate": 4.320987654320988e-07,
	"loss": 0.0853,
	"num_tokens": 17445812.0,
	"reward": -0.5143425464630127,
	"reward_std": 1.080782175064087,
	"rewards/reward_model/mean": -0.5143425464630127,
	"rewards/reward_model/std": 1.3849540948867798,
	"step": 36,
	"step_time": 169.69654387421906
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3369140625,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 77.77197265625,
	"completions/mean_terminated_length": 52.25110626220703,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.8757861303165555,
	"epoch": 0.015238879736408566,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.8937506675720215,
	"kl": 0.010500569681425986,
	"learning_rate": 4.4444444444444444e-07,
	"loss": 0.0466,
	"num_tokens": 17886785.0,
	"reward": -0.2941930890083313,
	"reward_std": 1.089874267578125,
	"rewards/reward_model/mean": -0.2941930890083313,
	"rewards/reward_model/std": 1.3422448635101318,
	"step": 37,
	"step_time": 168.3831845112145
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.33203125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 78.78515625,
	"completions/mean_terminated_length": 54.32163619995117,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.8339524874463677,
	"epoch": 0.015650741350906095,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.8777432441711426,
	"kl": 0.014370948238138226,
	"learning_rate": 4.567901234567901e-07,
	"loss": 0.0431,
	"num_tokens": 18363593.0,
	"reward": -0.21549299359321594,
	"reward_std": 1.0654486417770386,
	"rewards/reward_model/mean": -0.21549299359321594,
	"rewards/reward_model/std": 1.286303997039795,
	"step": 38,
	"step_time": 168.6560257449746
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3623046875,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 80.3076171875,
	"completions/mean_terminated_length": 53.211334228515625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.809513804037124,
	"epoch": 0.016062602965403624,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.850346028804779,
	"kl": 0.019628787720648688,
	"learning_rate": 4.6913580246913586e-07,
	"loss": -0.0144,
	"num_tokens": 18782015.0,
	"reward": -0.19260446727275848,
	"reward_std": 1.0799050331115723,
	"rewards/reward_model/mean": -0.19260446727275848,
	"rewards/reward_model/std": 1.4198755025863647,
	"step": 39,
	"step_time": 169.06077374424785
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3212890625,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 78.080078125,
	"completions/mean_terminated_length": 54.44892120361328,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.856378594879061,
	"epoch": 0.016474464579901153,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.8225836753845215,
	"kl": 0.023546985856228275,
	"learning_rate": 4.814814814814815e-07,
	"loss": 0.0268,
	"num_tokens": 19238691.0,
	"reward": -0.22110876441001892,
	"reward_std": 1.0441968441009521,
	"rewards/reward_model/mean": -0.22110876441001892,
	"rewards/reward_model/std": 1.3271934986114502,
	"step": 40,
	"step_time": 169.5594472438097
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.33251953125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 77.1416015625,
	"completions/mean_terminated_length": 51.805416107177734,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.764324402436614,
	"epoch": 0.01688632619439868,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.7773052453994751,
	"kl": 0.029280464848852716,
	"learning_rate": 4.938271604938272e-07,
	"loss": 0.0194,
	"num_tokens": 19755301.0,
	"reward": -0.12512998282909393,
	"reward_std": 1.0090844631195068,
	"rewards/reward_model/mean": -0.12512998282909393,
	"rewards/reward_model/std": 1.2345008850097656,
	"step": 41,
	"step_time": 170.39410974271595
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.27490234375,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 72.548828125,
	"completions/mean_terminated_length": 51.52592468261719,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.758555585052818,
	"epoch": 0.01729818780889621,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.8362958431243896,
	"kl": 0.03495379232481355,
	"learning_rate": 5.061728395061729e-07,
	"loss": 0.0014,
	"num_tokens": 20199209.0,
	"reward": -0.034443896263837814,
	"reward_std": 1.0466477870941162,
	"rewards/reward_model/mean": -0.034443896263837814,
	"rewards/reward_model/std": 1.2755711078643799,
	"step": 42,
	"step_time": 170.17393092392012
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.25830078125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 74.35302734375,
	"completions/mean_terminated_length": 55.6701774597168,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.6830066749826074,
	"epoch": 0.01771004942339374,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.9207571148872375,
	"kl": 0.03826815243519377,
	"learning_rate": 5.185185185185185e-07,
	"loss": 0.0197,
	"num_tokens": 20667900.0,
	"reward": -0.03724297881126404,
	"reward_std": 0.9730924367904663,
	"rewards/reward_model/mean": -0.03724297881126404,
	"rewards/reward_model/std": 1.1648329496383667,
	"step": 43,
	"step_time": 168.8539799619466
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3427734375,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 82.59326171875,
	"completions/mean_terminated_length": 58.911590576171875,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.722061135340482,
	"epoch": 0.018121911037891267,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.691871166229248,
	"kl": 0.035297417802212294,
	"learning_rate": 5.308641975308642e-07,
	"loss": 0.0252,
	"num_tokens": 21084443.0,
	"reward": 0.1364922821521759,
	"reward_std": 0.9992862939834595,
	"rewards/reward_model/mean": 0.1364922821521759,
	"rewards/reward_model/std": 1.338813066482544,
	"step": 44,
	"step_time": 168.30778062017635
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2216796875,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 70.23193359375,
	"completions/mean_terminated_length": 53.77854537963867,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.6960708745755255,
	"epoch": 0.018533772652388796,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.8769946694374084,
	"kl": 0.04808991262325435,
	"learning_rate": 5.432098765432099e-07,
	"loss": -0.0016,
	"num_tokens": 21510710.0,
	"reward": 0.1898983120918274,
	"reward_std": 0.9757044911384583,
	"rewards/reward_model/mean": 0.1898983120918274,
	"rewards/reward_model/std": 1.1677379608154297,
	"step": 45,
	"step_time": 169.03864477854222
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.28125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 77.9912109375,
	"completions/mean_terminated_length": 58.42255401611328,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.6502066934481263,
	"epoch": 0.018945634266886325,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.6699737906455994,
	"kl": 0.04364871226789546,
	"learning_rate": 5.555555555555555e-07,
	"loss": 0.0298,
	"num_tokens": 21968036.0,
	"reward": 0.14871619641780853,
	"reward_std": 0.8983126878738403,
	"rewards/reward_model/mean": 0.14871619641780853,
	"rewards/reward_model/std": 1.1425597667694092,
	"step": 46,
	"step_time": 169.17142802104354
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.27294921875,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 73.318359375,
	"completions/mean_terminated_length": 52.789791107177734,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.806841244455427,
	"epoch": 0.019357495881383854,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.7548915147781372,
	"kl": 0.050439021695638075,
	"learning_rate": 5.679012345679012e-07,
	"loss": 0.0261,
	"num_tokens": 22428752.0,
	"reward": 0.18833398818969727,
	"reward_std": 0.9490935802459717,
	"rewards/reward_model/mean": 0.18833398818969727,
	"rewards/reward_model/std": 1.218595027923584,
	"step": 47,
	"step_time": 169.8053262718022
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.23583984375,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 67.080078125,
	"completions/mean_terminated_length": 48.278594970703125,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.7256725700572133,
	"epoch": 0.019769357495881382,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.8408772349357605,
	"kl": 0.05761563615669729,
	"learning_rate": 5.80246913580247e-07,
	"loss": -0.0234,
	"num_tokens": 22883444.0,
	"reward": 0.10819900035858154,
	"reward_std": 0.9136756062507629,
	"rewards/reward_model/mean": 0.10819900035858154,
	"rewards/reward_model/std": 1.13023042678833,
	"step": 48,
	"step_time": 170.28967663506046
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.25927734375,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 71.54296875,
	"completions/mean_terminated_length": 51.78114700317383,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.676765794865787,
	"epoch": 0.02018121911037891,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.6633880138397217,
	"kl": 0.05726497815339826,
	"learning_rate": 5.925925925925926e-07,
	"loss": 0.0077,
	"num_tokens": 23311980.0,
	"reward": 0.31039929389953613,
	"reward_std": 0.8826955556869507,
	"rewards/reward_model/mean": 0.31039929389953613,
	"rewards/reward_model/std": 1.1599924564361572,
	"step": 49,
	"step_time": 169.17084869695827
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2392578125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 73.46142578125,
	"completions/mean_terminated_length": 56.30873107910156,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.5921450154855847,
	"epoch": 0.02059308072487644,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.7272253632545471,
	"kl": 0.05662718684470747,
	"learning_rate": 6.049382716049383e-07,
	"loss": -0.0038,
	"num_tokens": 23729245.0,
	"reward": 0.2335912585258484,
	"reward_std": 0.9175702929496765,
	"rewards/reward_model/mean": 0.2335912585258484,
	"rewards/reward_model/std": 1.1314274072647095,
	"step": 50,
	"step_time": 169.32695539435372
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.26220703125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 77.47705078125,
	"completions/mean_terminated_length": 59.521507263183594,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.630979296285659,
	"epoch": 0.021004942339373972,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.7209051847457886,
	"kl": 0.0596031873501488,
	"learning_rate": 6.17283950617284e-07,
	"loss": -0.019,
	"num_tokens": 24171054.0,
	"reward": 0.3881710171699524,
	"reward_std": 0.9779696464538574,
	"rewards/reward_model/mean": 0.3881710171699524,
	"rewards/reward_model/std": 1.2501736879348755,
	"step": 51,
	"step_time": 169.39474018104374
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3837890625,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 87.35888671875,
	"completions/mean_terminated_length": 62.0467529296875,
	"completions/min_length": 3.0,
	"completions/min_terminated_length": 3.0,
	"entropy": 1.5277671799995005,
	"epoch": 0.0214168039538715,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.6657931208610535,
	"kl": 0.04347534721819102,
	"learning_rate": 6.296296296296296e-07,
	"loss": 0.0084,
	"num_tokens": 24640845.0,
	"reward": 0.40088099241256714,
	"reward_std": 0.8522671461105347,
	"rewards/reward_model/mean": 0.40088099241256714,
	"rewards/reward_model/std": 1.1760755777359009,
	"step": 52,
	"step_time": 169.99416326358914
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.26708984375,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 76.86572265625,
	"completions/mean_terminated_length": 58.23118209838867,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.5355965252965689,
	"epoch": 0.02182866556836903,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.7246338725090027,
	"kl": 0.059198625254794024,
	"learning_rate": 6.419753086419753e-07,
	"loss": 0.0065,
	"num_tokens": 25146234.0,
	"reward": 0.32493141293525696,
	"reward_std": 0.8951080441474915,
	"rewards/reward_model/mean": 0.32493141293525696,
	"rewards/reward_model/std": 1.109892725944519,
	"step": 53,
	"step_time": 170.4134237067774
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.26220703125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 77.32373046875,
	"completions/mean_terminated_length": 59.313697814941406,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.4214782847557217,
	"epoch": 0.022240527182866558,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.663506805896759,
	"kl": 0.07469483163731638,
	"learning_rate": 6.54320987654321e-07,
	"loss": -0.0106,
	"num_tokens": 25556273.0,
	"reward": 0.5579333305358887,
	"reward_std": 0.8257571458816528,
	"rewards/reward_model/mean": 0.5579333305358887,
	"rewards/reward_model/std": 1.0652962923049927,
	"step": 54,
	"step_time": 168.6097109238617
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2158203125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 67.8369140625,
	"completions/mean_terminated_length": 51.278953552246094,
	"completions/min_length": 3.0,
	"completions/min_terminated_length": 3.0,
	"entropy": 1.4409256265498698,
	"epoch": 0.022652388797364087,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.6998105645179749,
	"kl": 0.09127819760760758,
	"learning_rate": 6.666666666666666e-07,
	"loss": -0.0145,
	"num_tokens": 26009315.0,
	"reward": 0.36190831661224365,
	"reward_std": 0.8500241637229919,
	"rewards/reward_model/mean": 0.36190831661224365,
	"rewards/reward_model/std": 1.0737853050231934,
	"step": 55,
	"step_time": 169.63903413154185
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2265625,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 71.69873046875,
	"completions/mean_terminated_length": 55.20643997192383,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.4477170635946095,
	"epoch": 0.023064250411861616,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.6856215596199036,
	"kl": 0.10684883118665311,
	"learning_rate": 6.790123456790124e-07,
	"loss": -0.0154,
	"num_tokens": 26453082.0,
	"reward": 0.5452687740325928,
	"reward_std": 0.7654911875724792,
	"rewards/reward_model/mean": 0.5452687740325928,
	"rewards/reward_model/std": 0.9965056777000427,
	"step": 56,
	"step_time": 167.3982848683372
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.31689453125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 80.6259765625,
	"completions/mean_terminated_length": 58.6490364074707,
	"completions/min_length": 3.0,
	"completions/min_terminated_length": 3.0,
	"entropy": 1.3876500492915511,
	"epoch": 0.023476112026359144,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.5802730321884155,
	"kl": 0.08226650860888185,
	"learning_rate": 6.913580246913581e-07,
	"loss": -0.0124,
	"num_tokens": 26859036.0,
	"reward": 0.4985049366950989,
	"reward_std": 0.8241320252418518,
	"rewards/reward_model/mean": 0.4985049366950989,
	"rewards/reward_model/std": 1.177066683769226,
	"step": 57,
	"step_time": 168.99145932588726
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.29150390625,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 81.41162109375,
	"completions/mean_terminated_length": 62.24327850341797,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.3736186842434108,
	"epoch": 0.023887973640856673,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.672706127166748,
	"kl": 0.0943700206480571,
	"learning_rate": 7.037037037037037e-07,
	"loss": 0.0084,
	"num_tokens": 27318439.0,
	"reward": 0.7748833894729614,
	"reward_std": 0.8400471806526184,
	"rewards/reward_model/mean": 0.7748833894729614,
	"rewards/reward_model/std": 1.0984324216842651,
	"step": 58,
	"step_time": 168.8087218273431
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.30419921875,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 79.111328125,
	"completions/mean_terminated_length": 57.737545013427734,
	"completions/min_length": 3.0,
	"completions/min_terminated_length": 3.0,
	"entropy": 1.3578353270422667,
	"epoch": 0.024299835255354202,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.6414256691932678,
	"kl": 0.10844983752758708,
	"learning_rate": 7.160493827160494e-07,
	"loss": 0.0049,
	"num_tokens": 27845483.0,
	"reward": 0.6581840515136719,
	"reward_std": 0.7642059326171875,
	"rewards/reward_model/mean": 0.6581840515136719,
	"rewards/reward_model/std": 1.0230196714401245,
	"step": 59,
	"step_time": 170.09878712054342
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.20947265625,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 68.9013671875,
	"completions/mean_terminated_length": 53.24150848388672,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.359937054105103,
	"epoch": 0.02471169686985173,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.6327216029167175,
	"kl": 0.12389619748864789,
	"learning_rate": 7.283950617283951e-07,
	"loss": -0.0019,
	"num_tokens": 28360609.0,
	"reward": 0.5726691484451294,
	"reward_std": 0.7263065576553345,
	"rewards/reward_model/mean": 0.5726691484451294,
	"rewards/reward_model/std": 1.0532201528549194,
	"step": 60,
	"step_time": 169.033332105726
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.21728515625,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 74.28515625,
	"completions/mean_terminated_length": 59.37367248535156,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.2873307822737843,
	"epoch": 0.02512355848434926,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.6471663117408752,
	"kl": 0.11947597128164489,
	"learning_rate": 7.407407407407407e-07,
	"loss": -0.018,
	"num_tokens": 28796649.0,
	"reward": 0.8057171106338501,
	"reward_std": 0.6930927038192749,
	"rewards/reward_model/mean": 0.8057171106338501,
	"rewards/reward_model/std": 0.9504708647727966,
	"step": 61,
	"step_time": 170.3676045727916
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.27880859375,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 80.5634765625,
	"completions/mean_terminated_length": 62.22477722167969,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.4397811936214566,
	"epoch": 0.025535420098846788,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.5935730934143066,
	"kl": 0.11211827301303856,
	"learning_rate": 7.530864197530865e-07,
	"loss": 0.005,
	"num_tokens": 29272715.0,
	"reward": 0.5376583337783813,
	"reward_std": 0.7316970825195312,
	"rewards/reward_model/mean": 0.5376583337783813,
	"rewards/reward_model/std": 1.0817116498947144,
	"step": 62,
	"step_time": 170.07809142861515
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.27978515625,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 83.00634765625,
	"completions/mean_terminated_length": 65.52745819091797,
	"completions/min_length": 3.0,
	"completions/min_terminated_length": 3.0,
	"entropy": 1.2591755213215947,
	"epoch": 0.025947281713344317,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.6332337260246277,
	"kl": 0.09935169246455189,
	"learning_rate": 7.654320987654321e-07,
	"loss": -0.0047,
	"num_tokens": 29675416.0,
	"reward": 0.8634133338928223,
	"reward_std": 0.7280638217926025,
	"rewards/reward_model/mean": 0.8634133338928223,
	"rewards/reward_model/std": 1.0552853345870972,
	"step": 63,
	"step_time": 169.48511258373037
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.234375,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 79.23291015625,
	"completions/mean_terminated_length": 64.30420684814453,
	"completions/min_length": 4.0,
	"completions/min_terminated_length": 4.0,
	"entropy": 1.3228048181626946,
	"epoch": 0.026359143327841845,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.5896514058113098,
	"kl": 0.10629570209130179,
	"learning_rate": 7.777777777777778e-07,
	"loss": 0.0018,
	"num_tokens": 30103349.0,
	"reward": 0.7413797378540039,
	"reward_std": 0.6787456274032593,
	"rewards/reward_model/mean": 0.7413797378540039,
	"rewards/reward_model/std": 0.9844362735748291,
	"step": 64,
	"step_time": 168.49784950073808
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2080078125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 77.34521484375,
	"completions/mean_terminated_length": 64.04130554199219,
	"completions/min_length": 3.0,
	"completions/min_terminated_length": 3.0,
	"entropy": 1.3859816826879978,
	"epoch": 0.026771004942339374,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.569113552570343,
	"kl": 0.1355650291661732,
	"learning_rate": 7.901234567901236e-07,
	"loss": 0.0008,
	"num_tokens": 30598360.0,
	"reward": 0.8138879537582397,
	"reward_std": 0.6921124458312988,
	"rewards/reward_model/mean": 0.8138879537582397,
	"rewards/reward_model/std": 1.008180856704712,
	"step": 65,
	"step_time": 168.94378049625084
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2333984375,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 76.275390625,
	"completions/mean_terminated_length": 60.52738952636719,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.276806804118678,
	"epoch": 0.027182866556836903,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.584193229675293,
	"kl": 0.12889757197990548,
	"learning_rate": 8.024691358024692e-07,
	"loss": 0.0109,
	"num_tokens": 31028524.0,
	"reward": 0.7695643901824951,
	"reward_std": 0.7420451641082764,
	"rewards/reward_model/mean": 0.7695643901824951,
	"rewards/reward_model/std": 1.1982769966125488,
	"step": 66,
	"step_time": 169.0462037078105
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.2314453125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 81.75390625,
	"completions/mean_terminated_length": 67.82718658447266,
	"completions/min_length": 3.0,
	"completions/min_terminated_length": 3.0,
	"entropy": 1.2658436398487538,
	"epoch": 0.02759472817133443,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.5793888568878174,
	"kl": 0.10512553945591208,
	"learning_rate": 8.148148148148147e-07,
	"loss": 0.019,
	"num_tokens": 31459860.0,
	"reward": 0.9362199306488037,
	"reward_std": 0.6280190944671631,
	"rewards/reward_model/mean": 0.9362199306488037,
	"rewards/reward_model/std": 1.003322958946228,
	"step": 67,
	"step_time": 168.24778978247195
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.20068359375,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 75.294921875,
	"completions/mean_terminated_length": 62.06230926513672,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.3218755372799933,
	"epoch": 0.02800658978583196,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.619716465473175,
	"kl": 0.13950362912146375,
	"learning_rate": 8.271604938271605e-07,
	"loss": 0.0032,
	"num_tokens": 31900336.0,
	"reward": 0.7856715321540833,
	"reward_std": 0.6523309946060181,
	"rewards/reward_model/mean": 0.7856715321540833,
	"rewards/reward_model/std": 0.9243690371513367,
	"step": 68,
	"step_time": 168.62237379932776
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.1875,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 72.62060546875,
	"completions/mean_terminated_length": 59.84074783325195,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.2817874399479479,
	"epoch": 0.02841845140032949,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.6988398432731628,
	"kl": 0.1405531533237081,
	"learning_rate": 8.395061728395062e-07,
	"loss": 0.0001,
	"num_tokens": 32349991.0,
	"reward": 0.7539228200912476,
	"reward_std": 0.6927404403686523,
	"rewards/reward_model/mean": 0.7539228200912476,
	"rewards/reward_model/std": 1.1138005256652832,
	"step": 69,
	"step_time": 168.95463426411152
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.18310546875,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 76.68505859375,
	"completions/mean_terminated_length": 65.18290710449219,
	"completions/min_length": 1.0,
	"completions/min_terminated_length": 1.0,
	"entropy": 1.3052547052502632,
	"epoch": 0.028830313014827018,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.5859233140945435,
	"kl": 0.14809596522536594,
	"learning_rate": 8.518518518518518e-07,
	"loss": 0.006,
	"num_tokens": 32799778.0,
	"reward": 0.899767279624939,
	"reward_std": 0.6600509881973267,
	"rewards/reward_model/mean": 0.899767279624939,
	"rewards/reward_model/std": 1.0378800630569458,
	"step": 70,
	"step_time": 168.27934673754498
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.21435546875,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 77.1396484375,
	"completions/mean_terminated_length": 63.26289749145508,
	"completions/min_length": 3.0,
	"completions/min_terminated_length": 3.0,
	"entropy": 1.2957828119397163,
	"epoch": 0.029242174629324547,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.6063217520713806,
	"kl": 0.1499464159278432,
	"learning_rate": 8.641975308641976e-07,
	"loss": 0.0009,
	"num_tokens": 33258528.0,
	"reward": 0.9532963037490845,
	"reward_std": 0.5860557556152344,
	"rewards/reward_model/mean": 0.9532963037490845,
	"rewards/reward_model/std": 0.9753101468086243,
	"step": 71,
	"step_time": 168.12876597139984
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.123046875,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 71.07666015625,
	"completions/mean_terminated_length": 63.08964538574219,
	"completions/min_length": 4.0,
	"completions/min_terminated_length": 4.0,
	"entropy": 1.2831250824965537,
	"epoch": 0.029654036243822075,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.650992214679718,
	"kl": 0.14023015335260425,
	"learning_rate": 8.765432098765433e-07,
	"loss": 0.0053,
	"num_tokens": 33708125.0,
	"reward": 0.8464133739471436,
	"reward_std": 0.5981078147888184,
	"rewards/reward_model/mean": 0.8464133739471436,
	"rewards/reward_model/std": 0.9848034977912903,
	"step": 72,
	"step_time": 168.81821045372635
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.1435546875,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 70.923828125,
	"completions/mean_terminated_length": 61.356895446777344,
	"completions/min_length": 4.0,
	"completions/min_terminated_length": 4.0,
	"entropy": 1.27306635864079,
	"epoch": 0.030065897858319604,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.6549474000930786,
	"kl": 0.15989831037586555,
	"learning_rate": 8.888888888888889e-07,
	"loss": -0.0137,
	"num_tokens": 34163265.0,
	"reward": 0.9670735001564026,
	"reward_std": 0.590969979763031,
	"rewards/reward_model/mean": 0.9670735001564026,
	"rewards/reward_model/std": 0.9453141689300537,
	"step": 73,
	"step_time": 169.4882780299522
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.15234375,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 76.10107421875,
	"completions/mean_terminated_length": 66.77362060546875,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"entropy": 1.2641989213880152,
	"epoch": 0.030477759472817133,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.5717378854751587,
	"kl": 0.13075158167339396,
	"learning_rate": 9.012345679012347e-07,
	"loss": 0.0049,
	"num_tokens": 34635568.0,
	"reward": 1.0525561571121216,
	"reward_std": 0.5589165687561035,
	"rewards/reward_model/mean": 1.0525561571121216,
	"rewards/reward_model/std": 0.8849756121635437,
	"step": 74,
	"step_time": 169.85129849473014
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.18017578125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 76.9130859375,
	"completions/mean_terminated_length": 65.6855239868164,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"entropy": 1.2528326134197414,
	"epoch": 0.03088962108731466,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.5799550414085388,
	"kl": 0.13488844226230867,
	"learning_rate": 9.135802469135802e-07,
	"loss": 0.0233,
	"num_tokens": 35042110.0,
	"reward": 1.0121339559555054,
	"reward_std": 0.6093316078186035,
	"rewards/reward_model/mean": 1.0121339559555054,
	"rewards/reward_model/std": 0.9795147776603699,
	"step": 75,
	"step_time": 168.39488552790135
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.11328125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 72.35791015625,
	"completions/mean_terminated_length": 65.24944305419922,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.2186853648163378,
	"epoch": 0.03130148270181219,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.6350732445716858,
	"kl": 0.13198584888596088,
	"learning_rate": 9.259259259259259e-07,
	"loss": 0.0245,
	"num_tokens": 35454619.0,
	"reward": 1.1278910636901855,
	"reward_std": 0.6185814738273621,
	"rewards/reward_model/mean": 1.1278910636901855,
	"rewards/reward_model/std": 0.9232901930809021,
	"step": 76,
	"step_time": 169.25788368703797
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.13134765625,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 70.81689453125,
	"completions/mean_terminated_length": 62.170318603515625,
	"completions/min_length": 3.0,
	"completions/min_terminated_length": 3.0,
	"entropy": 1.247056140564382,
	"epoch": 0.03171334431630972,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.759410560131073,
	"kl": 0.16601154587988276,
	"learning_rate": 9.382716049382717e-07,
	"loss": -0.0003,
	"num_tokens": 35845316.0,
	"reward": 1.0192276239395142,
	"reward_std": 0.5931369066238403,
	"rewards/reward_model/mean": 1.0192276239395142,
	"rewards/reward_model/std": 0.9772949814796448,
	"step": 77,
	"step_time": 168.0306376479566
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.09521484375,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 70.13916015625,
	"completions/mean_terminated_length": 64.05018615722656,
	"completions/min_length": 4.0,
	"completions/min_terminated_length": 4.0,
	"entropy": 1.2540333659853786,
	"epoch": 0.03212520593080725,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.6740376353263855,
	"kl": 0.1378997444990091,
	"learning_rate": 9.506172839506173e-07,
	"loss": -0.013,
	"num_tokens": 36287137.0,
	"reward": 0.9819941520690918,
	"reward_std": 0.604373574256897,
	"rewards/reward_model/mean": 0.9819941520690918,
	"rewards/reward_model/std": 0.9436709880828857,
	"step": 78,
	"step_time": 168.8950103893876
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.09130859375,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 68.37744140625,
	"completions/mean_terminated_length": 62.3863525390625,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.1720305329654366,
	"epoch": 0.032537067545304776,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.6466286182403564,
	"kl": 0.12133494461886585,
	"learning_rate": 9.62962962962963e-07,
	"loss": -0.0048,
	"num_tokens": 36719654.0,
	"reward": 1.1645737886428833,
	"reward_std": 0.5557790398597717,
	"rewards/reward_model/mean": 1.1645737886428833,
	"rewards/reward_model/std": 0.9391114711761475,
	"step": 79,
	"step_time": 169.75694013293833
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.13330078125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 69.42236328125,
	"completions/mean_terminated_length": 60.412960052490234,
	"completions/min_length": 3.0,
	"completions/min_terminated_length": 3.0,
	"entropy": 1.2074508473742753,
	"epoch": 0.032948929159802305,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.7178550362586975,
	"kl": 0.16269512700091582,
	"learning_rate": 9.753086419753088e-07,
	"loss": 0.0024,
	"num_tokens": 37198599.0,
	"reward": 1.0070809125900269,
	"reward_std": 0.6197090148925781,
	"rewards/reward_model/mean": 1.0070809125900269,
	"rewards/reward_model/std": 0.8695884943008423,
	"step": 80,
	"step_time": 170.65315298642963
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.10888671875,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 69.23583984375,
	"completions/mean_terminated_length": 62.05534362792969,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.1959036465268582,
	"epoch": 0.033360790774299834,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.6767128705978394,
	"kl": 0.1640322696766816,
	"learning_rate": 9.876543209876544e-07,
	"loss": -0.0102,
	"num_tokens": 37627498.0,
	"reward": 1.051206111907959,
	"reward_std": 0.6312122344970703,
	"rewards/reward_model/mean": 1.051206111907959,
	"rewards/reward_model/std": 1.006866455078125,
	"step": 81,
	"step_time": 169.01685216045007
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.15234375,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 76.1484375,
	"completions/mean_terminated_length": 66.8294906616211,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"entropy": 1.2353556689340621,
	"epoch": 0.03377265238879736,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.6121835708618164,
	"kl": 0.13194930272584315,
	"learning_rate": 1e-06,
	"loss": -0.0084,
	"num_tokens": 38077466.0,
	"reward": 1.1334960460662842,
	"reward_std": 0.5429809093475342,
	"rewards/reward_model/mean": 1.1334960460662842,
	"rewards/reward_model/std": 1.0062233209609985,
	"step": 82,
	"step_time": 169.77917499747127
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.1455078125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 74.54931640625,
	"completions/mean_terminated_length": 65.44742584228516,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.2487247881945223,
	"epoch": 0.03418451400329489,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.8223838210105896,
	"kl": 0.1559760042873677,
	"learning_rate": 1.0123456790123457e-06,
	"loss": 0.0192,
	"num_tokens": 38560095.0,
	"reward": 1.0274322032928467,
	"reward_std": 0.6149877309799194,
	"rewards/reward_model/mean": 1.0274322032928467,
	"rewards/reward_model/std": 0.8848612308502197,
	"step": 83,
	"step_time": 168.7728981245309
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0693359375,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 68.9453125,
	"completions/mean_terminated_length": 64.54563903808594,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.203527741599828,
	"epoch": 0.03459637561779242,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.6430616974830627,
	"kl": 0.16221579984994605,
	"learning_rate": 1.0246913580246913e-06,
	"loss": -0.0054,
	"num_tokens": 38989743.0,
	"reward": 1.1352043151855469,
	"reward_std": 0.5808489918708801,
	"rewards/reward_model/mean": 1.1352043151855469,
	"rewards/reward_model/std": 1.0034772157669067,
	"step": 84,
	"step_time": 167.36356884567067
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.08544921875,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 73.13720703125,
	"completions/mean_terminated_length": 68.01121520996094,
	"completions/min_length": 4.0,
	"completions/min_terminated_length": 4.0,
	"entropy": 1.1932638001162559,
	"epoch": 0.03500823723228995,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.618569552898407,
	"kl": 0.1509321930789156,
	"learning_rate": 1.037037037037037e-06,
	"loss": -0.0105,
	"num_tokens": 39413960.0,
	"reward": 1.1917307376861572,
	"reward_std": 0.6115972995758057,
	"rewards/reward_model/mean": 1.1917307376861572,
	"rewards/reward_model/std": 0.8577749729156494,
	"step": 85,
	"step_time": 169.4022615076974
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.1328125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 74.16455078125,
	"completions/mean_terminated_length": 65.91948699951172,
	"completions/min_length": 4.0,
	"completions/min_terminated_length": 4.0,
	"entropy": 1.1511160423979163,
	"epoch": 0.03542009884678748,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.6170347929000854,
	"kl": 0.16613518859958276,
	"learning_rate": 1.0493827160493827e-06,
	"loss": 0.0068,
	"num_tokens": 39897081.0,
	"reward": 1.2492460012435913,
	"reward_std": 0.6051790714263916,
	"rewards/reward_model/mean": 1.2492460012435913,
	"rewards/reward_model/std": 0.8991779685020447,
	"step": 86,
	"step_time": 168.7876625736244
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.091796875,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 73.81884765625,
	"completions/mean_terminated_length": 68.34247589111328,
	"completions/min_length": 1.0,
	"completions/min_terminated_length": 1.0,
	"entropy": 1.216100089251995,
	"epoch": 0.035831960461285006,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.6116214394569397,
	"kl": 0.17693783454888035,
	"learning_rate": 1.0617283950617285e-06,
	"loss": 0.014,
	"num_tokens": 40304870.0,
	"reward": 1.327715277671814,
	"reward_std": 0.5273313522338867,
	"rewards/reward_model/mean": 1.327715277671814,
	"rewards/reward_model/std": 0.8829416036605835,
	"step": 87,
	"step_time": 170.00215818034485
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.10498046875,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 82.087890625,
	"completions/mean_terminated_length": 76.70267486572266,
	"completions/min_length": 4.0,
	"completions/min_terminated_length": 4.0,
	"entropy": 1.2015210629906505,
	"epoch": 0.036243822075782535,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.5836432576179504,
	"kl": 0.14079495580517687,
	"learning_rate": 1.074074074074074e-06,
	"loss": 0.0043,
	"num_tokens": 40738362.0,
	"reward": 1.2116522789001465,
	"reward_std": 0.5785905122756958,
	"rewards/reward_model/mean": 1.2116522789001465,
	"rewards/reward_model/std": 0.8599736094474792,
	"step": 88,
	"step_time": 170.04560359567404
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.07080078125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 71.1552734375,
	"completions/mean_terminated_length": 66.82395935058594,
	"completions/min_length": 1.0,
	"completions/min_terminated_length": 1.0,
	"entropy": 1.1739569688215852,
	"epoch": 0.036655683690280064,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.5943244099617004,
	"kl": 0.16060514625860378,
	"learning_rate": 1.0864197530864199e-06,
	"loss": -0.0115,
	"num_tokens": 41174584.0,
	"reward": 1.079056739807129,
	"reward_std": 0.5492511987686157,
	"rewards/reward_model/mean": 1.079056739807129,
	"rewards/reward_model/std": 0.8875714540481567,
	"step": 89,
	"step_time": 170.3232544688508
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.08203125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 73.50341796875,
	"completions/mean_terminated_length": 68.63350677490234,
	"completions/min_length": 4.0,
	"completions/min_terminated_length": 4.0,
	"entropy": 1.2288584825582802,
	"epoch": 0.03706754530477759,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.6955690979957581,
	"kl": 0.18743510471540503,
	"learning_rate": 1.0987654320987655e-06,
	"loss": 0.0392,
	"num_tokens": 41634879.0,
	"reward": 1.2760483026504517,
	"reward_std": 0.5275530219078064,
	"rewards/reward_model/mean": 1.2760483026504517,
	"rewards/reward_model/std": 0.8526185154914856,
	"step": 90,
	"step_time": 169.28955688048154
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.103515625,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 77.4619140625,
	"completions/mean_terminated_length": 71.62635803222656,
	"completions/min_length": 2.0,
	"completions/min_terminated_length": 2.0,
	"entropy": 1.1926879836246371,
	"epoch": 0.03747940691927512,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.6305184364318848,
	"kl": 0.16981972678331658,
	"learning_rate": 1.111111111111111e-06,
	"loss": -0.0032,
	"num_tokens": 42093777.0,
	"reward": 1.3892216682434082,
	"reward_std": 0.5210399627685547,
	"rewards/reward_model/mean": 1.3892216682434082,
	"rewards/reward_model/std": 0.8532023429870605,
	"step": 91,
	"step_time": 169.93193591805175
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.1005859375,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 80.02099609375,
	"completions/mean_terminated_length": 74.65526580810547,
	"completions/min_length": 3.0,
	"completions/min_terminated_length": 3.0,
	"entropy": 1.220891160191968,
	"epoch": 0.03789126853377265,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.5791942477226257,
	"kl": 0.13990605663275346,
	"learning_rate": 1.1234567901234568e-06,
	"loss": 0.021,
	"num_tokens": 42533916.0,
	"reward": 1.3777389526367188,
	"reward_std": 0.5628249049186707,
	"rewards/reward_model/mean": 1.3777389526367188,
	"rewards/reward_model/std": 0.8695874214172363,
	"step": 92,
	"step_time": 168.67672005156055
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.13427734375,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 81.66845703125,
	"completions/mean_terminated_length": 74.48223876953125,
	"completions/min_length": 8.0,
	"completions/min_terminated_length": 8.0,
	"entropy": 1.1679968070238829,
	"epoch": 0.03830313014827018,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.577627956867218,
	"kl": 0.14946075380430557,
	"learning_rate": 1.1358024691358024e-06,
	"loss": 0.0005,
	"num_tokens": 42977269.0,
	"reward": 1.2400810718536377,
	"reward_std": 0.5488580465316772,
	"rewards/reward_model/mean": 1.2400810718536377,
	"rewards/reward_model/std": 0.8818415999412537,
	"step": 93,
	"step_time": 169.11300712404773
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.1591796875,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 89.02783203125,
	"completions/mean_terminated_length": 81.64982604980469,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"entropy": 1.2256716336123645,
	"epoch": 0.03871499176276771,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 1.3020151853561401,
	"kl": 0.1575723221176304,
	"learning_rate": 1.1481481481481482e-06,
	"loss": 0.0078,
	"num_tokens": 43468782.0,
	"reward": 1.2878694534301758,
	"reward_std": 0.5034958124160767,
	"rewards/reward_model/mean": 1.2878694534301758,
	"rewards/reward_model/std": 0.8000524640083313,
	"step": 94,
	"step_time": 168.66197129152715
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.146484375,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 82.29296875,
	"completions/mean_terminated_length": 74.4485092163086,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"entropy": 1.2125889593735337,
	"epoch": 0.039126853377265236,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.5775403380393982,
	"kl": 0.17348909995052963,
	"learning_rate": 1.160493827160494e-06,
	"loss": 0.0291,
	"num_tokens": 43925606.0,
	"reward": 1.3064830303192139,
	"reward_std": 0.5317621231079102,
	"rewards/reward_model/mean": 1.3064830303192139,
	"rewards/reward_model/std": 0.8767746090888977,
	"step": 95,
	"step_time": 168.66238435404375
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.11376953125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 83.8466796875,
	"completions/mean_terminated_length": 78.17851257324219,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"entropy": 1.2149158080574125,
	"epoch": 0.039538714991762765,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.5917447209358215,
	"kl": 0.16629101379658096,
	"learning_rate": 1.1728395061728396e-06,
	"loss": 0.0372,
	"num_tokens": 44365228.0,
	"reward": 1.310151219367981,
	"reward_std": 0.5394536852836609,
	"rewards/reward_model/mean": 1.310151219367981,
	"rewards/reward_model/std": 0.8472654223442078,
	"step": 96,
	"step_time": 170.65063601452857
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.087890625,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 81.22119140625,
	"completions/mean_terminated_length": 76.7136001586914,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"entropy": 1.2331861625425518,
	"epoch": 0.039950576606260293,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.6160045266151428,
	"kl": 0.18450818251585588,
	"learning_rate": 1.1851851851851852e-06,
	"loss": 0.0239,
	"num_tokens": 44809841.0,
	"reward": 1.388469934463501,
	"reward_std": 0.47841960191726685,
	"rewards/reward_model/mean": 1.388469934463501,
	"rewards/reward_model/std": 0.7604539394378662,
	"step": 97,
	"step_time": 168.5139070255682
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.14404296875,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 86.03857421875,
	"completions/mean_terminated_length": 78.97718811035156,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"entropy": 1.2734931902959943,
	"epoch": 0.04036243822075782,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.8496507406234741,
	"kl": 0.1486935554712545,
	"learning_rate": 1.197530864197531e-06,
	"loss": 0.024,
	"num_tokens": 45291392.0,
	"reward": 1.273500680923462,
	"reward_std": 0.5131819844245911,
	"rewards/reward_model/mean": 1.273500680923462,
	"rewards/reward_model/std": 0.8183842301368713,
	"step": 98,
	"step_time": 169.52931605745107
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.07861328125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 76.30078125,
	"completions/mean_terminated_length": 71.8897705078125,
	"completions/min_length": 4.0,
	"completions/min_terminated_length": 4.0,
	"entropy": 1.210193380014971,
	"epoch": 0.04077429983525535,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.6533998847007751,
	"kl": 0.16968276555417106,
	"learning_rate": 1.2098765432098765e-06,
	"loss": 0.0227,
	"num_tokens": 45769608.0,
	"reward": 1.374595046043396,
	"reward_std": 0.5025352835655212,
	"rewards/reward_model/mean": 1.374595046043396,
	"rewards/reward_model/std": 0.7355093359947205,
	"step": 99,
	"step_time": 170.76728575211018
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.10595703125,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 80.27490234375,
	"completions/mean_terminated_length": 74.61878204345703,
	"completions/min_length": 12.0,
	"completions/min_terminated_length": 12.0,
	"entropy": 1.1873215795494616,
	"epoch": 0.04118616144975288,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.5959328413009644,
	"kl": 0.14610896681551822,
	"learning_rate": 1.2222222222222221e-06,
	"loss": 0.0218,
	"num_tokens": 46193339.0,
	"reward": 1.393322229385376,
	"reward_std": 0.5245035886764526,
	"rewards/reward_model/mean": 1.393322229385376,
	"rewards/reward_model/std": 0.8767962455749512,
	"step": 100,
	"step_time": 169.11898464756086
	}
	],
	"logging_steps": 1,
	"max_steps": 2428,
	"num_input_tokens_seen": 46193339,
	"num_train_epochs": 1,
	"save_steps": 100,
	"stateful_callbacks": {
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": false,
	"should_log": false,
	"should_save": true,
	"should_training_stop": false
	},
	"attributes": {}
	}
	},
	"total_flos": 0.0,
	"train_batch_size": 1,
	"trial_name": null,
	"trial_params": null
	}