bbox_model_step_200 / trainer_state.json

Upload folder using huggingface_hub

994aecd verified about 1 month ago

191 kB

	{
	"best_global_step": null,
	"best_metric": null,
	"best_model_checkpoint": null,
	"epoch": 100.0,
	"eval_steps": 50,
	"global_step": 200,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.203125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 565.0,
	"completions/mean_length": 382.40625,
	"completions/mean_terminated_length": 218.86274509803923,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 0.5,
	"grad_norm": 7.220178667921573e-05,
	"kl": 0.0,
	"learning_rate": 0.0,
	"loss": 0.0008,
	"num_tokens": 44698.0,
	"reward": 0.616805911064148,
	"reward_std": 0.03843851387500763,
	"rewards/reward_matching": 0.5590299367904663,
	"rewards/reward_object_count": 0.40693962574005127,
	"rewards/reward_parseable": 1.0,
	"step": 1
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.09375,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 919.0,
	"completions/mean_length": 284.40625,
	"completions/mean_terminated_length": 207.89655172413794,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 1.0,
	"grad_norm": 9.472812234889716e-05,
	"kl": 0.0,
	"learning_rate": 1e-06,
	"loss": 0.0009,
	"num_tokens": 84084.0,
	"reward": 0.5493605732917786,
	"reward_std": 0.05996648967266083,
	"rewards/reward_matching": 0.4322524964809418,
	"rewards/reward_object_count": 0.4500454068183899,
	"rewards/reward_parseable": 1.0,
	"step": 2
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.1875,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 693.0,
	"completions/mean_length": 319.875,
	"completions/mean_terminated_length": 157.3846153846154,
	"completions/min_length": 73.0,
	"completions/min_terminated_length": 73.0,
	"epoch": 1.5,
	"grad_norm": 9.924145706463605e-05,
	"kl": 0.00042467157436476555,
	"learning_rate": 2e-06,
	"loss": 0.0015,
	"num_tokens": 123180.0,
	"reward": 0.5689160823822021,
	"reward_std": 0.04476189613342285,
	"rewards/reward_matching": 0.47214236855506897,
	"rewards/reward_object_count": 0.4281533360481262,
	"rewards/reward_parseable": 1.0,
	"step": 3
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.1875,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 834.0,
	"completions/mean_length": 308.015625,
	"completions/mean_terminated_length": 142.78846153846155,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 2.0,
	"grad_norm": 5.0325375923421234e-05,
	"kl": 0.0004158883857598994,
	"learning_rate": 3e-06,
	"loss": 0.0005,
	"num_tokens": 165037.0,
	"reward": 0.6509107351303101,
	"reward_std": 0.03968992456793785,
	"rewards/reward_matching": 0.5518875122070312,
	"rewards/reward_object_count": 0.5988913774490356,
	"rewards/reward_parseable": 1.0,
	"step": 4
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 997.0,
	"completions/mean_length": 269.28125,
	"completions/mean_terminated_length": 244.93548387096774,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 2.5,
	"grad_norm": 0.00011539691331563517,
	"kl": 0.0006114643038017675,
	"learning_rate": 4e-06,
	"loss": 0.0016,
	"num_tokens": 201535.0,
	"reward": 0.6278109550476074,
	"reward_std": 0.05065031349658966,
	"rewards/reward_matching": 0.5660988092422485,
	"rewards/reward_object_count": 0.44075822830200195,
	"rewards/reward_parseable": 1.0,
	"step": 5
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.15625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 919.0,
	"completions/mean_length": 353.546875,
	"completions/mean_terminated_length": 229.38888888888889,
	"completions/min_length": 24.0,
	"completions/min_terminated_length": 24.0,
	"epoch": 3.0,
	"grad_norm": 0.0003470871888566762,
	"kl": 0.00038929956645006314,
	"learning_rate": 4.9999999999999996e-06,
	"loss": -0.0011,
	"num_tokens": 241506.0,
	"reward": 0.5265649557113647,
	"reward_std": 0.08980950713157654,
	"rewards/reward_matching": 0.4059111475944519,
	"rewards/reward_object_count": 0.4463413953781128,
	"rewards/reward_parseable": 0.96875,
	"step": 6
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.296875,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 931.0,
	"completions/mean_length": 455.703125,
	"completions/mean_terminated_length": 215.75555555555556,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 3.5,
	"grad_norm": 0.00015567304217256606,
	"kl": 0.0004744630350614898,
	"learning_rate": 6e-06,
	"loss": 0.0018,
	"num_tokens": 290575.0,
	"reward": 0.5346964001655579,
	"reward_std": 0.08377400040626526,
	"rewards/reward_matching": 0.43381091952323914,
	"rewards/reward_object_count": 0.38767415285110474,
	"rewards/reward_parseable": 0.984375,
	"step": 7
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 890.0,
	"completions/mean_length": 494.71875,
	"completions/mean_terminated_length": 254.13636363636363,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 4.0,
	"grad_norm": 9.709381993161514e-05,
	"kl": 0.0003735233622137457,
	"learning_rate": 7e-06,
	"loss": 0.0015,
	"num_tokens": 343741.0,
	"reward": 0.5899163484573364,
	"reward_std": 0.03980647027492523,
	"rewards/reward_matching": 0.5186960697174072,
	"rewards/reward_object_count": 0.3934932053089142,
	"rewards/reward_parseable": 1.0,
	"step": 8
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 301.0,
	"completions/max_terminated_length": 301.0,
	"completions/mean_length": 78.6875,
	"completions/mean_terminated_length": 78.6875,
	"completions/min_length": 26.0,
	"completions/min_terminated_length": 26.0,
	"epoch": 4.5,
	"grad_norm": 0.00030578242149204016,
	"kl": 0.0012195941890240647,
	"learning_rate": 8e-06,
	"loss": -0.001,
	"num_tokens": 370601.0,
	"reward": 0.6947240829467773,
	"reward_std": 0.09824053198099136,
	"rewards/reward_matching": 0.5668145418167114,
	"rewards/reward_object_count": 0.8044270873069763,
	"rewards/reward_parseable": 0.96875,
	"step": 9
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.109375,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 988.0,
	"completions/mean_length": 223.171875,
	"completions/mean_terminated_length": 124.82456140350877,
	"completions/min_length": 22.0,
	"completions/min_terminated_length": 22.0,
	"epoch": 5.0,
	"grad_norm": 0.00012970698298886418,
	"kl": 0.0004933492928103078,
	"learning_rate": 9e-06,
	"loss": 0.0004,
	"num_tokens": 406388.0,
	"reward": 0.6668163537979126,
	"reward_std": 0.056093666702508926,
	"rewards/reward_matching": 0.5769085884094238,
	"rewards/reward_object_count": 0.6189813613891602,
	"rewards/reward_parseable": 0.984375,
	"step": 10
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.046875,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 271.0,
	"completions/mean_length": 123.21875,
	"completions/mean_terminated_length": 78.91803278688525,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 5.5,
	"grad_norm": 0.00010655791993485764,
	"kl": 0.0008107178764475975,
	"learning_rate": 9.999999999999999e-06,
	"loss": 0.0002,
	"num_tokens": 435458.0,
	"reward": 0.56708824634552,
	"reward_std": 0.10232001543045044,
	"rewards/reward_matching": 0.419674813747406,
	"rewards/reward_object_count": 0.5920416116714478,
	"rewards/reward_parseable": 0.984375,
	"step": 11
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.15625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 505.0,
	"completions/mean_length": 278.671875,
	"completions/mean_terminated_length": 140.64814814814815,
	"completions/min_length": 32.0,
	"completions/min_terminated_length": 32.0,
	"epoch": 6.0,
	"grad_norm": 6.27185363555327e-05,
	"kl": 0.000603774591581896,
	"learning_rate": 1.1e-05,
	"loss": 0.0013,
	"num_tokens": 474797.0,
	"reward": 0.6307092308998108,
	"reward_std": 0.042993463575839996,
	"rewards/reward_matching": 0.5698345899581909,
	"rewards/reward_object_count": 0.4440425634384155,
	"rewards/reward_parseable": 1.0,
	"step": 12
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 896.0,
	"completions/mean_length": 519.484375,
	"completions/mean_terminated_length": 290.15909090909093,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 6.5,
	"grad_norm": 0.0002488858881406486,
	"kl": 0.00022653781888948288,
	"learning_rate": 1.2e-05,
	"loss": -0.0021,
	"num_tokens": 529548.0,
	"reward": 0.5722981691360474,
	"reward_std": 0.0947316363453865,
	"rewards/reward_matching": 0.5071631073951721,
	"rewards/reward_object_count": 0.3868764042854309,
	"rewards/reward_parseable": 0.953125,
	"step": 13
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 436.0,
	"completions/mean_length": 136.0625,
	"completions/mean_terminated_length": 107.41935483870968,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 7.0,
	"grad_norm": 4.9686714191921055e-05,
	"kl": 0.0007801693172950763,
	"learning_rate": 1.3000000000000001e-05,
	"loss": 0.0002,
	"num_tokens": 556880.0,
	"reward": 0.6056115031242371,
	"reward_std": 0.05535212904214859,
	"rewards/reward_matching": 0.5002501010894775,
	"rewards/reward_object_count": 0.5273073315620422,
	"rewards/reward_parseable": 1.0,
	"step": 14
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 865.0,
	"completions/mean_length": 477.109375,
	"completions/mean_terminated_length": 228.52272727272728,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 7.5,
	"grad_norm": 0.00013081179349683225,
	"kl": 0.0002415532035229262,
	"learning_rate": 1.4e-05,
	"loss": 0.0011,
	"num_tokens": 602839.0,
	"reward": 0.5026674270629883,
	"reward_std": 0.07394808530807495,
	"rewards/reward_matching": 0.38987797498703003,
	"rewards/reward_object_count": 0.359328031539917,
	"rewards/reward_parseable": 0.984375,
	"step": 15
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 238.0,
	"completions/max_terminated_length": 238.0,
	"completions/mean_length": 86.21875,
	"completions/mean_terminated_length": 86.21875,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 8.0,
	"grad_norm": 0.00016548681014683098,
	"kl": 0.0009981263428926468,
	"learning_rate": 1.5e-05,
	"loss": -0.0007,
	"num_tokens": 629221.0,
	"reward": 0.6400452852249146,
	"reward_std": 0.07901425659656525,
	"rewards/reward_matching": 0.49512743949890137,
	"rewards/reward_object_count": 0.73046875,
	"rewards/reward_parseable": 0.984375,
	"step": 16
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.171875,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 970.0,
	"completions/mean_length": 356.625,
	"completions/mean_terminated_length": 218.11320754716982,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 8.5,
	"grad_norm": 0.0001052798397722654,
	"kl": 0.000348803438100731,
	"learning_rate": 1.6e-05,
	"loss": 0.0017,
	"num_tokens": 670349.0,
	"reward": 0.5609536170959473,
	"reward_std": 0.05881837010383606,
	"rewards/reward_matching": 0.4391196370124817,
	"rewards/reward_object_count": 0.48740923404693604,
	"rewards/reward_parseable": 1.0,
	"step": 17
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 238.0,
	"completions/max_terminated_length": 238.0,
	"completions/mean_length": 104.28125,
	"completions/mean_terminated_length": 104.28125,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 9.0,
	"grad_norm": 0.0001435808662790805,
	"kl": 0.0010840660106623545,
	"learning_rate": 1.7e-05,
	"loss": -0.0006,
	"num_tokens": 696927.0,
	"reward": 0.6028817892074585,
	"reward_std": 0.08744296431541443,
	"rewards/reward_matching": 0.5085169672966003,
	"rewards/reward_object_count": 0.5044828653335571,
	"rewards/reward_parseable": 0.984375,
	"step": 18
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.203125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 634.0,
	"completions/mean_length": 341.03125,
	"completions/mean_terminated_length": 166.94117647058823,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 9.5,
	"grad_norm": 7.831490802345797e-05,
	"kl": 0.0003636158817243995,
	"learning_rate": 1.8e-05,
	"loss": 0.0016,
	"num_tokens": 736417.0,
	"reward": 0.5801401138305664,
	"reward_std": 0.04980730637907982,
	"rewards/reward_matching": 0.4626754820346832,
	"rewards/reward_object_count": 0.5126744508743286,
	"rewards/reward_parseable": 1.0,
	"step": 19
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.1875,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 667.0,
	"completions/mean_length": 283.40625,
	"completions/mean_terminated_length": 112.5,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 10.0,
	"grad_norm": 5.31727600900922e-05,
	"kl": 0.0005135804531164467,
	"learning_rate": 1.9e-05,
	"loss": 0.0006,
	"num_tokens": 773499.0,
	"reward": 0.6203708648681641,
	"reward_std": 0.0496257059276104,
	"rewards/reward_matching": 0.4899587631225586,
	"rewards/reward_object_count": 0.631977915763855,
	"rewards/reward_parseable": 1.0,
	"step": 20
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.109375,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 1006.0,
	"completions/mean_length": 231.671875,
	"completions/mean_terminated_length": 134.3684210526316,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 10.5,
	"grad_norm": 5.5479034926975146e-05,
	"kl": 0.000991112490737578,
	"learning_rate": 1.9999999999999998e-05,
	"loss": 0.0003,
	"num_tokens": 808230.0,
	"reward": 0.6594911217689514,
	"reward_std": 0.044610194861888885,
	"rewards/reward_matching": 0.543105959892273,
	"rewards/reward_object_count": 0.668137788772583,
	"rewards/reward_parseable": 1.0,
	"step": 21
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 855.0,
	"completions/mean_length": 272.296875,
	"completions/mean_terminated_length": 164.91071428571428,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 11.0,
	"grad_norm": 8.819045615382493e-05,
	"kl": 0.0005528706569748465,
	"learning_rate": 2.1e-05,
	"loss": 0.0003,
	"num_tokens": 843961.0,
	"reward": 0.5963397026062012,
	"reward_std": 0.07382210344076157,
	"rewards/reward_matching": 0.45430704951286316,
	"rewards/reward_object_count": 0.6344020962715149,
	"rewards/reward_parseable": 0.984375,
	"step": 22
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.09375,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 600.0,
	"completions/mean_length": 264.21875,
	"completions/mean_terminated_length": 185.6206896551724,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 11.5,
	"grad_norm": 0.0001272808003704995,
	"kl": 0.0010251196290482767,
	"learning_rate": 2.2e-05,
	"loss": 0.0015,
	"num_tokens": 878215.0,
	"reward": 0.5910627841949463,
	"reward_std": 0.07647538185119629,
	"rewards/reward_matching": 0.4740520715713501,
	"rewards/reward_object_count": 0.5331577062606812,
	"rewards/reward_parseable": 1.0,
	"step": 23
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.234375,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 1001.0,
	"completions/mean_length": 437.671875,
	"completions/mean_terminated_length": 258.18367346938777,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 12.0,
	"grad_norm": 0.00030737402266822755,
	"kl": 0.0005441164066724014,
	"learning_rate": 2.3000000000000003e-05,
	"loss": -0.0008,
	"num_tokens": 928690.0,
	"reward": 0.4855830669403076,
	"reward_std": 0.11943839490413666,
	"rewards/reward_matching": 0.4223440885543823,
	"rewards/reward_object_count": 0.20775802433490753,
	"rewards/reward_parseable": 0.953125,
	"step": 24
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.1875,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 919.0,
	"completions/mean_length": 425.546875,
	"completions/mean_terminated_length": 287.4423076923077,
	"completions/min_length": 73.0,
	"completions/min_terminated_length": 73.0,
	"epoch": 12.5,
	"grad_norm": 0.0001554730988573283,
	"kl": 0.0006814353218942415,
	"learning_rate": 2.4e-05,
	"loss": 0.0031,
	"num_tokens": 973589.0,
	"reward": 0.5343494415283203,
	"reward_std": 0.07961155474185944,
	"rewards/reward_matching": 0.44471532106399536,
	"rewards/reward_object_count": 0.3532262444496155,
	"rewards/reward_parseable": 0.984375,
	"step": 25
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.09375,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 957.0,
	"completions/mean_length": 217.65625,
	"completions/mean_terminated_length": 134.24137931034483,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 13.0,
	"grad_norm": 0.0002844391274265945,
	"kl": 0.0010056742103188299,
	"learning_rate": 2.5e-05,
	"loss": -0.0027,
	"num_tokens": 1005823.0,
	"reward": 0.5949710011482239,
	"reward_std": 0.09501722455024719,
	"rewards/reward_matching": 0.4553234577178955,
	"rewards/reward_object_count": 0.6557595729827881,
	"rewards/reward_parseable": 0.953125,
	"step": 26
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.09375,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 436.0,
	"completions/mean_length": 212.03125,
	"completions/mean_terminated_length": 128.0344827586207,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 13.5,
	"grad_norm": 7.75867301854305e-05,
	"kl": 0.0008590275319875218,
	"learning_rate": 2.6000000000000002e-05,
	"loss": 0.0016,
	"num_tokens": 1037057.0,
	"reward": 0.5786451101303101,
	"reward_std": 0.05433168634772301,
	"rewards/reward_matching": 0.453294575214386,
	"rewards/reward_object_count": 0.5333421230316162,
	"rewards/reward_parseable": 1.0,
	"step": 27
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.171875,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 766.0,
	"completions/mean_length": 299.171875,
	"completions/mean_terminated_length": 148.73584905660377,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 14.0,
	"grad_norm": 0.00023781158961355686,
	"kl": 0.00094783199892845,
	"learning_rate": 2.7000000000000002e-05,
	"loss": -0.0002,
	"num_tokens": 1078668.0,
	"reward": 0.5741347074508667,
	"reward_std": 0.08269049972295761,
	"rewards/reward_matching": 0.5097981691360474,
	"rewards/reward_object_count": 0.37252914905548096,
	"rewards/reward_parseable": 0.96875,
	"step": 28
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.171875,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 972.0,
	"completions/mean_length": 259.09375,
	"completions/mean_terminated_length": 100.33962264150944,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 14.5,
	"grad_norm": 6.763833516743034e-05,
	"kl": 0.0008891169927665032,
	"learning_rate": 2.8e-05,
	"loss": -0.0,
	"num_tokens": 1117394.0,
	"reward": 0.6306777000427246,
	"reward_std": 0.04761374741792679,
	"rewards/reward_matching": 0.5269917249679565,
	"rewards/reward_object_count": 0.5724132061004639,
	"rewards/reward_parseable": 1.0,
	"step": 29
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 524.0,
	"completions/mean_length": 183.71875,
	"completions/mean_terminated_length": 170.38095238095238,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 15.0,
	"grad_norm": 6.627541006309912e-05,
	"kl": 0.0015845489542698488,
	"learning_rate": 2.9e-05,
	"loss": 0.0008,
	"num_tokens": 1149696.0,
	"reward": 0.6976220011711121,
	"reward_std": 0.04097198694944382,
	"rewards/reward_matching": 0.6446974277496338,
	"rewards/reward_object_count": 0.5540179014205933,
	"rewards/reward_parseable": 1.0,
	"step": 30
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.046875,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 769.0,
	"completions/mean_length": 240.78125,
	"completions/mean_terminated_length": 202.2622950819672,
	"completions/min_length": 70.0,
	"completions/min_terminated_length": 70.0,
	"epoch": 15.5,
	"grad_norm": 0.00014589792408514768,
	"kl": 0.0013277196849230677,
	"learning_rate": 3e-05,
	"loss": 0.0009,
	"num_tokens": 1185330.0,
	"reward": 0.5496550798416138,
	"reward_std": 0.06000591441988945,
	"rewards/reward_matching": 0.4662263095378876,
	"rewards/reward_object_count": 0.3495963215827942,
	"rewards/reward_parseable": 1.0,
	"step": 31
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 273.0,
	"completions/mean_length": 96.34375,
	"completions/mean_terminated_length": 81.61904761904762,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 16.0,
	"grad_norm": 5.5187963880598545e-05,
	"kl": 0.004379737103590742,
	"learning_rate": 2.999992132854894e-05,
	"loss": 0.0004,
	"num_tokens": 1209160.0,
	"reward": 0.6008960604667664,
	"reward_std": 0.05777881667017937,
	"rewards/reward_matching": 0.43078088760375977,
	"rewards/reward_object_count": 0.7121376991271973,
	"rewards/reward_parseable": 1.0,
	"step": 32
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 387.0,
	"completions/max_terminated_length": 387.0,
	"completions/mean_length": 117.546875,
	"completions/mean_terminated_length": 117.546875,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 16.5,
	"grad_norm": 6.276514614000916e-05,
	"kl": 0.003105600946582854,
	"learning_rate": 2.999968531502098e-05,
	"loss": -0.0,
	"num_tokens": 1237867.0,
	"reward": 0.5474408268928528,
	"reward_std": 0.048626385629177094,
	"rewards/reward_matching": 0.4194202125072479,
	"rewards/reward_object_count": 0.47894346714019775,
	"rewards/reward_parseable": 1.0,
	"step": 33
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.09375,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 996.0,
	"completions/mean_length": 274.921875,
	"completions/mean_terminated_length": 197.43103448275863,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 17.0,
	"grad_norm": 8.7667656771373e-05,
	"kl": 0.0016155529519892298,
	"learning_rate": 2.99992919618918e-05,
	"loss": 0.0013,
	"num_tokens": 1273126.0,
	"reward": 0.616933286190033,
	"reward_std": 0.06678177416324615,
	"rewards/reward_matching": 0.48520591855049133,
	"rewards/reward_object_count": 0.6290486454963684,
	"rewards/reward_parseable": 1.0,
	"step": 34
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.21875,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 832.0,
	"completions/mean_length": 384.578125,
	"completions/mean_terminated_length": 205.54,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 17.5,
	"grad_norm": 0.00012024820898659527,
	"kl": 0.0013034686198807321,
	"learning_rate": 2.999874127328748e-05,
	"loss": -0.0012,
	"num_tokens": 1315403.0,
	"reward": 0.549712061882019,
	"reward_std": 0.06744106113910675,
	"rewards/reward_matching": 0.44805556535720825,
	"rewards/reward_object_count": 0.43564340472221375,
	"rewards/reward_parseable": 0.96875,
	"step": 35
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 403.0,
	"completions/mean_length": 145.71875,
	"completions/mean_terminated_length": 131.77777777777777,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 18.0,
	"grad_norm": 8.172642992576584e-05,
	"kl": 0.004074121621670201,
	"learning_rate": 2.9998033254984483e-05,
	"loss": -0.0004,
	"num_tokens": 1341753.0,
	"reward": 0.5836251974105835,
	"reward_std": 0.06786907464265823,
	"rewards/reward_matching": 0.478513240814209,
	"rewards/reward_object_count": 0.4982115924358368,
	"rewards/reward_parseable": 0.984375,
	"step": 36
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.28125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 929.0,
	"completions/mean_length": 419.25,
	"completions/mean_terminated_length": 182.6086956521739,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 18.5,
	"grad_norm": 0.00017777887114789337,
	"kl": 0.0012582261915667914,
	"learning_rate": 2.999716791440959e-05,
	"loss": -0.0007,
	"num_tokens": 1388809.0,
	"reward": 0.533015251159668,
	"reward_std": 0.10117805004119873,
	"rewards/reward_matching": 0.4379459619522095,
	"rewards/reward_object_count": 0.38248807191848755,
	"rewards/reward_parseable": 0.96875,
	"step": 37
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.140625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 895.0,
	"completions/mean_length": 339.390625,
	"completions/mean_terminated_length": 227.36363636363637,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 19.0,
	"grad_norm": 0.00014843710232526064,
	"kl": 0.002032484859228134,
	"learning_rate": 2.9996145260639812e-05,
	"loss": 0.0027,
	"num_tokens": 1429474.0,
	"reward": 0.5366367697715759,
	"reward_std": 0.09246323257684708,
	"rewards/reward_matching": 0.4080054759979248,
	"rewards/reward_object_count": 0.47479236125946045,
	"rewards/reward_parseable": 0.984375,
	"step": 38
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.1875,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 969.0,
	"completions/mean_length": 370.875,
	"completions/mean_terminated_length": 220.15384615384616,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 19.5,
	"grad_norm": 0.00011248727241763845,
	"kl": 0.0019941654754802585,
	"learning_rate": 2.9994965304402304e-05,
	"loss": 0.0011,
	"num_tokens": 1470554.0,
	"reward": 0.6027147769927979,
	"reward_std": 0.06692709028720856,
	"rewards/reward_matching": 0.4802256226539612,
	"rewards/reward_object_count": 0.5728965997695923,
	"rewards/reward_parseable": 1.0,
	"step": 39
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 999.0,
	"completions/mean_length": 261.140625,
	"completions/mean_terminated_length": 152.16071428571428,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 20.0,
	"grad_norm": 7.97496541053988e-05,
	"kl": 0.0031269127648556605,
	"learning_rate": 2.999362805807425e-05,
	"loss": 0.0015,
	"num_tokens": 1509091.0,
	"reward": 0.6954025030136108,
	"reward_std": 0.032408565282821655,
	"rewards/reward_matching": 0.6296951770782471,
	"rewards/reward_object_count": 0.587926983833313,
	"rewards/reward_parseable": 1.0,
	"step": 40
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 337.0,
	"completions/mean_length": 104.671875,
	"completions/mean_terminated_length": 90.07936507936508,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 20.5,
	"grad_norm": 7.970151636982337e-05,
	"kl": 0.0068569304421544075,
	"learning_rate": 2.9992133535682725e-05,
	"loss": -0.0001,
	"num_tokens": 1537934.0,
	"reward": 0.6217859983444214,
	"reward_std": 0.04923363775014877,
	"rewards/reward_matching": 0.4929783046245575,
	"rewards/reward_object_count": 0.6299948692321777,
	"rewards/reward_parseable": 1.0,
	"step": 41
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.109375,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 601.0,
	"completions/mean_length": 261.578125,
	"completions/mean_terminated_length": 167.94736842105263,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 21.0,
	"grad_norm": 0.0001107210700865835,
	"kl": 0.002925441396655515,
	"learning_rate": 2.9990481752904566e-05,
	"loss": 0.0001,
	"num_tokens": 1572659.0,
	"reward": 0.5629563331604004,
	"reward_std": 0.059148214757442474,
	"rewards/reward_matching": 0.49224740266799927,
	"rewards/reward_object_count": 0.35366469621658325,
	"rewards/reward_parseable": 0.984375,
	"step": 42
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.15625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 964.0,
	"completions/mean_length": 428.6875,
	"completions/mean_terminated_length": 318.44444444444446,
	"completions/min_length": 73.0,
	"completions/min_terminated_length": 73.0,
	"epoch": 21.5,
	"grad_norm": 8.503717253915966e-05,
	"kl": 0.002438544644974172,
	"learning_rate": 2.9988672727066197e-05,
	"loss": 0.0011,
	"num_tokens": 1618719.0,
	"reward": 0.5435788631439209,
	"reward_std": 0.05614367127418518,
	"rewards/reward_matching": 0.45421895384788513,
	"rewards/reward_object_count": 0.3552376627922058,
	"rewards/reward_parseable": 1.0,
	"step": 43
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 425.0,
	"completions/max_terminated_length": 425.0,
	"completions/mean_length": 129.625,
	"completions/mean_terminated_length": 129.625,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 22.0,
	"grad_norm": 0.00011053122580051422,
	"kl": 0.005899429728742689,
	"learning_rate": 2.9986706477143436e-05,
	"loss": -0.0004,
	"num_tokens": 1646919.0,
	"reward": 0.6216758489608765,
	"reward_std": 0.07677946984767914,
	"rewards/reward_matching": 0.5217663645744324,
	"rewards/reward_object_count": 0.5430803298950195,
	"rewards/reward_parseable": 1.0,
	"step": 44
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.3125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 601.0,
	"completions/mean_length": 424.140625,
	"completions/mean_terminated_length": 151.47727272727272,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 22.5,
	"grad_norm": 0.000124470898299478,
	"kl": 0.0022016632283339277,
	"learning_rate": 2.9984583023761318e-05,
	"loss": 0.002,
	"num_tokens": 1696528.0,
	"reward": 0.5888717174530029,
	"reward_std": 0.041133634746074677,
	"rewards/reward_matching": 0.5104244947433472,
	"rewards/reward_object_count": 0.41308486461639404,
	"rewards/reward_parseable": 1.0,
	"step": 45
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.109375,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 957.0,
	"completions/mean_length": 330.796875,
	"completions/mean_terminated_length": 245.66666666666666,
	"completions/min_length": 73.0,
	"completions/min_terminated_length": 73.0,
	"epoch": 23.0,
	"grad_norm": 0.00011844925757031888,
	"kl": 0.0031491442350670695,
	"learning_rate": 2.998230238919386e-05,
	"loss": 0.0021,
	"num_tokens": 1733763.0,
	"reward": 0.5587866902351379,
	"reward_std": 0.05619703605771065,
	"rewards/reward_matching": 0.42642295360565186,
	"rewards/reward_object_count": 0.5146645307540894,
	"rewards/reward_parseable": 1.0,
	"step": 46
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 236.0,
	"completions/mean_length": 117.53125,
	"completions/mean_terminated_length": 88.29032258064517,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 23.5,
	"grad_norm": 0.0001667703763814643,
	"kl": 0.007947034202516079,
	"learning_rate": 2.9979864597363846e-05,
	"loss": 0.0001,
	"num_tokens": 1765349.0,
	"reward": 0.7034145593643188,
	"reward_std": 0.08120490610599518,
	"rewards/reward_matching": 0.6374689340591431,
	"rewards/reward_object_count": 0.6202906370162964,
	"rewards/reward_parseable": 0.984375,
	"step": 47
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 849.0,
	"completions/mean_length": 246.625,
	"completions/mean_terminated_length": 194.8,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 24.0,
	"grad_norm": 0.00029066766728647053,
	"kl": 0.00509595571202226,
	"learning_rate": 2.9977269673842554e-05,
	"loss": -0.0019,
	"num_tokens": 1801037.0,
	"reward": 0.616760790348053,
	"reward_std": 0.10064470022916794,
	"rewards/reward_matching": 0.5000989437103271,
	"rewards/reward_object_count": 0.6303821802139282,
	"rewards/reward_parseable": 0.953125,
	"step": 48
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.078125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 539.0,
	"completions/mean_length": 208.234375,
	"completions/mean_terminated_length": 139.10169491525423,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 24.5,
	"grad_norm": 0.0001182894702651538,
	"kl": 0.005687805562047288,
	"learning_rate": 2.997451764584951e-05,
	"loss": 0.0004,
	"num_tokens": 1836188.0,
	"reward": 0.5805012583732605,
	"reward_std": 0.06066010519862175,
	"rewards/reward_matching": 0.4713587164878845,
	"rewards/reward_object_count": 0.48843005299568176,
	"rewards/reward_parseable": 1.0,
	"step": 49
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 311.0,
	"completions/mean_length": 139.796875,
	"completions/mean_terminated_length": 125.76190476190476,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 25.0,
	"grad_norm": 5.7654466218082234e-05,
	"kl": 0.006528859958052635,
	"learning_rate": 2.9971608542252175e-05,
	"loss": 0.0008,
	"num_tokens": 1866319.0,
	"reward": 0.6885979175567627,
	"reward_std": 0.039257895201444626,
	"rewards/reward_matching": 0.5929476618766785,
	"rewards/reward_object_count": 0.6641466617584229,
	"rewards/reward_parseable": 1.0,
	"step": 50
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.046875,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 821.0,
	"completions/mean_length": 204.296875,
	"completions/mean_terminated_length": 163.98360655737704,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 25.5,
	"grad_norm": 7.800667663104832e-05,
	"kl": 0.0062508382252417505,
	"learning_rate": 2.9968542393565674e-05,
	"loss": 0.0006,
	"num_tokens": 1897058.0,
	"reward": 0.6192189455032349,
	"reward_std": 0.052703116089105606,
	"rewards/reward_matching": 0.5146477818489075,
	"rewards/reward_object_count": 0.5521511435508728,
	"rewards/reward_parseable": 1.0,
	"step": 51
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.15625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 858.0,
	"completions/mean_length": 370.859375,
	"completions/mean_terminated_length": 249.90740740740742,
	"completions/min_length": 73.0,
	"completions/min_terminated_length": 73.0,
	"epoch": 26.0,
	"grad_norm": 7.006085070315748e-05,
	"kl": 0.00459836726076901,
	"learning_rate": 2.996531923195246e-05,
	"loss": 0.0008,
	"num_tokens": 1939097.0,
	"reward": 0.5544993877410889,
	"reward_std": 0.04202552139759064,
	"rewards/reward_matching": 0.4601183831691742,
	"rewards/reward_object_count": 0.3921419382095337,
	"rewards/reward_parseable": 1.0,
	"step": 52
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.203125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 1008.0,
	"completions/mean_length": 561.46875,
	"completions/mean_terminated_length": 443.5686274509804,
	"completions/min_length": 73.0,
	"completions/min_terminated_length": 73.0,
	"epoch": 26.5,
	"grad_norm": 0.00015066047490108758,
	"kl": 0.003497203520964831,
	"learning_rate": 2.996193909122197e-05,
	"loss": 0.0023,
	"num_tokens": 1993335.0,
	"reward": 0.5232954025268555,
	"reward_std": 0.05474445968866348,
	"rewards/reward_matching": 0.45413488149642944,
	"rewards/reward_object_count": 0.2540724277496338,
	"rewards/reward_parseable": 1.0,
	"step": 53
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 349.0,
	"completions/max_terminated_length": 349.0,
	"completions/mean_length": 104.953125,
	"completions/mean_terminated_length": 104.953125,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 27.0,
	"grad_norm": 0.00012995678116567433,
	"kl": 0.00867222691886127,
	"learning_rate": 2.995840200683028e-05,
	"loss": -0.0004,
	"num_tokens": 2020596.0,
	"reward": 0.6695042848587036,
	"reward_std": 0.08031092584133148,
	"rewards/reward_matching": 0.522784948348999,
	"rewards/reward_object_count": 0.7791666984558105,
	"rewards/reward_parseable": 1.0,
	"step": 54
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 731.0,
	"completions/mean_length": 191.703125,
	"completions/mean_terminated_length": 178.4920634920635,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 27.5,
	"grad_norm": 0.00017028290312737226,
	"kl": 0.006999723031185567,
	"learning_rate": 2.995470801587973e-05,
	"loss": 0.0004,
	"num_tokens": 2054049.0,
	"reward": 0.6205588579177856,
	"reward_std": 0.10116489231586456,
	"rewards/reward_matching": 0.5161659717559814,
	"rewards/reward_object_count": 0.5699214935302734,
	"rewards/reward_parseable": 0.984375,
	"step": 55
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 159.0,
	"completions/max_terminated_length": 159.0,
	"completions/mean_length": 68.734375,
	"completions/mean_terminated_length": 68.734375,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 28.0,
	"grad_norm": 6.421493890229613e-05,
	"kl": 0.012037722510285676,
	"learning_rate": 2.9950857157118544e-05,
	"loss": -0.0,
	"num_tokens": 2078032.0,
	"reward": 0.7118315696716309,
	"reward_std": 0.04499781131744385,
	"rewards/reward_matching": 0.5969762802124023,
	"rewards/reward_object_count": 0.7682291865348816,
	"rewards/reward_parseable": 1.0,
	"step": 56
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 997.0,
	"completions/mean_length": 200.46875,
	"completions/mean_terminated_length": 173.90322580645162,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 28.5,
	"grad_norm": 5.707196032744832e-05,
	"kl": 0.007291222224012017,
	"learning_rate": 2.9946849470940395e-05,
	"loss": 0.0002,
	"num_tokens": 2108526.0,
	"reward": 0.6197296380996704,
	"reward_std": 0.03817511722445488,
	"rewards/reward_matching": 0.5467733144760132,
	"rewards/reward_object_count": 0.4583281874656677,
	"rewards/reward_parseable": 1.0,
	"step": 57
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 463.0,
	"completions/mean_length": 168.453125,
	"completions/mean_terminated_length": 140.8548387096774,
	"completions/min_length": 25.0,
	"completions/min_terminated_length": 25.0,
	"epoch": 29.0,
	"grad_norm": 0.00027134420815855265,
	"kl": 0.009315767034422606,
	"learning_rate": 2.9942684999384034e-05,
	"loss": -0.0006,
	"num_tokens": 2139531.0,
	"reward": 0.5399819612503052,
	"reward_std": 0.0870380625128746,
	"rewards/reward_matching": 0.4160889685153961,
	"rewards/reward_object_count": 0.46726763248443604,
	"rewards/reward_parseable": 0.984375,
	"step": 58
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.046875,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 599.0,
	"completions/mean_length": 225.203125,
	"completions/mean_terminated_length": 185.91803278688525,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 29.5,
	"grad_norm": 7.844467472750694e-05,
	"kl": 0.007104447286110371,
	"learning_rate": 2.993836378613278e-05,
	"loss": 0.0006,
	"num_tokens": 2172568.0,
	"reward": 0.6556448936462402,
	"reward_std": 0.04675156623125076,
	"rewards/reward_matching": 0.5574519634246826,
	"rewards/reward_object_count": 0.6058685183525085,
	"rewards/reward_parseable": 1.0,
	"step": 59
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 995.0,
	"completions/mean_length": 278.96875,
	"completions/mean_terminated_length": 267.14285714285717,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 30.0,
	"grad_norm": 0.00016339456487912685,
	"kl": 0.007064116362016648,
	"learning_rate": 2.993388587651412e-05,
	"loss": -0.0004,
	"num_tokens": 2209366.0,
	"reward": 0.5095100402832031,
	"reward_std": 0.0792592391371727,
	"rewards/reward_matching": 0.43418169021606445,
	"rewards/reward_object_count": 0.2762550711631775,
	"rewards/reward_parseable": 0.96875,
	"step": 60
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 468.0,
	"completions/max_terminated_length": 468.0,
	"completions/mean_length": 142.390625,
	"completions/mean_terminated_length": 142.390625,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 30.5,
	"grad_norm": 9.066959319170564e-05,
	"kl": 0.009399229020345956,
	"learning_rate": 2.992925131749921e-05,
	"loss": 0.0001,
	"num_tokens": 2239663.0,
	"reward": 0.7246487140655518,
	"reward_std": 0.050000160932540894,
	"rewards/reward_matching": 0.6537588834762573,
	"rewards/reward_object_count": 0.6619668006896973,
	"rewards/reward_parseable": 1.0,
	"step": 61
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 943.0,
	"completions/mean_length": 273.4375,
	"completions/mean_terminated_length": 249.2258064516129,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 31.0,
	"grad_norm": 0.00016052013961598277,
	"kl": 0.0073067472549155354,
	"learning_rate": 2.9924460157702378e-05,
	"loss": 0.0003,
	"num_tokens": 2274827.0,
	"reward": 0.5953838229179382,
	"reward_std": 0.06788177788257599,
	"rewards/reward_matching": 0.49889495968818665,
	"rewards/reward_object_count": 0.4958592653274536,
	"rewards/reward_parseable": 0.984375,
	"step": 62
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 539.0,
	"completions/max_terminated_length": 539.0,
	"completions/mean_length": 217.921875,
	"completions/mean_terminated_length": 217.921875,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 31.5,
	"grad_norm": 8.415069169132039e-05,
	"kl": 0.007433710154145956,
	"learning_rate": 2.991951244738063e-05,
	"loss": 0.0001,
	"num_tokens": 2307398.0,
	"reward": 0.5583871006965637,
	"reward_std": 0.04484350234270096,
	"rewards/reward_matching": 0.45415085554122925,
	"rewards/reward_object_count": 0.4294828772544861,
	"rewards/reward_parseable": 1.0,
	"step": 63
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.078125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 833.0,
	"completions/mean_length": 195.3125,
	"completions/mean_terminated_length": 125.08474576271186,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 32.0,
	"grad_norm": 0.00020136036619078368,
	"kl": 0.009423179202713072,
	"learning_rate": 2.9914408238433095e-05,
	"loss": 0.0003,
	"num_tokens": 2336602.0,
	"reward": 0.6332944631576538,
	"reward_std": 0.060811009258031845,
	"rewards/reward_matching": 0.4973873496055603,
	"rewards/reward_object_count": 0.6743100881576538,
	"rewards/reward_parseable": 1.0,
	"step": 64
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 667.0,
	"completions/mean_length": 221.671875,
	"completions/mean_terminated_length": 195.79032258064515,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 32.5,
	"grad_norm": 0.0001210306872962974,
	"kl": 0.008098234015051275,
	"learning_rate": 2.990914758440052e-05,
	"loss": 0.0,
	"num_tokens": 2368773.0,
	"reward": 0.6689934730529785,
	"reward_std": 0.07700366526842117,
	"rewards/reward_matching": 0.567918062210083,
	"rewards/reward_object_count": 0.6568382382392883,
	"rewards/reward_parseable": 0.984375,
	"step": 65
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.078125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 832.0,
	"completions/mean_length": 289.75,
	"completions/mean_terminated_length": 227.52542372881356,
	"completions/min_length": 15.0,
	"completions/min_terminated_length": 15.0,
	"epoch": 33.0,
	"grad_norm": 0.00016435940051451325,
	"kl": 0.007474837242625654,
	"learning_rate": 2.9903730540464668e-05,
	"loss": 0.0001,
	"num_tokens": 2406901.0,
	"reward": 0.5091193914413452,
	"reward_std": 0.05931903421878815,
	"rewards/reward_matching": 0.43151775002479553,
	"rewards/reward_object_count": 0.2666684687137604,
	"rewards/reward_parseable": 0.984375,
	"step": 66
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 799.0,
	"completions/mean_length": 187.078125,
	"completions/mean_terminated_length": 160.08064516129033,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 33.5,
	"grad_norm": 8.364054519915953e-05,
	"kl": 0.010546724603045732,
	"learning_rate": 2.9898157163447767e-05,
	"loss": 0.0003,
	"num_tokens": 2440058.0,
	"reward": 0.6797939538955688,
	"reward_std": 0.04941866174340248,
	"rewards/reward_matching": 0.5802963376045227,
	"rewards/reward_object_count": 0.6580805778503418,
	"rewards/reward_parseable": 1.0,
	"step": 67
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.046875,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 964.0,
	"completions/mean_length": 253.125,
	"completions/mean_terminated_length": 215.21311475409837,
	"completions/min_length": 34.0,
	"completions/min_terminated_length": 34.0,
	"epoch": 34.0,
	"grad_norm": 0.0002845938433893025,
	"kl": 0.01157232653349638,
	"learning_rate": 2.9892427511811912e-05,
	"loss": 0.0001,
	"num_tokens": 2474562.0,
	"reward": 0.5502975583076477,
	"reward_std": 0.09778749942779541,
	"rewards/reward_matching": 0.43594658374786377,
	"rewards/reward_object_count": 0.45927298069000244,
	"rewards/reward_parseable": 0.984375,
	"step": 68
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 599.0,
	"completions/mean_length": 215.09375,
	"completions/mean_terminated_length": 189.0,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 34.5,
	"grad_norm": 0.00017123304132837802,
	"kl": 0.010649158153682947,
	"learning_rate": 2.9886541645658435e-05,
	"loss": 0.0003,
	"num_tokens": 2507592.0,
	"reward": 0.6654698848724365,
	"reward_std": 0.05868534743785858,
	"rewards/reward_matching": 0.5466117858886719,
	"rewards/reward_object_count": 0.6875138282775879,
	"rewards/reward_parseable": 1.0,
	"step": 69
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 959.0,
	"completions/mean_length": 212.8125,
	"completions/mean_terminated_length": 158.73333333333332,
	"completions/min_length": 27.0,
	"completions/min_terminated_length": 27.0,
	"epoch": 35.0,
	"grad_norm": 0.001203745836392045,
	"kl": 0.027731457608751953,
	"learning_rate": 2.9880499626727284e-05,
	"loss": -0.0008,
	"num_tokens": 2542396.0,
	"reward": 0.6588045954704285,
	"reward_std": 0.060804709792137146,
	"rewards/reward_matching": 0.5325086116790771,
	"rewards/reward_object_count": 0.7121223211288452,
	"rewards/reward_parseable": 0.984375,
	"step": 70
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 691.0,
	"completions/mean_length": 220.875,
	"completions/mean_terminated_length": 194.96774193548387,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 35.5,
	"grad_norm": 0.00018338189693167806,
	"kl": 0.011799431755207479,
	"learning_rate": 2.9874301518396377e-05,
	"loss": 0.0001,
	"num_tokens": 2578036.0,
	"reward": 0.6314491033554077,
	"reward_std": 0.07501392066478729,
	"rewards/reward_matching": 0.5245179533958435,
	"rewards/reward_object_count": 0.5836920142173767,
	"rewards/reward_parseable": 1.0,
	"step": 71
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 501.0,
	"completions/mean_length": 205.171875,
	"completions/mean_terminated_length": 178.75806451612902,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 36.0,
	"grad_norm": 0.00012583017814904451,
	"kl": 0.010461569239851087,
	"learning_rate": 2.986794738568094e-05,
	"loss": 0.0008,
	"num_tokens": 2608191.0,
	"reward": 0.613935112953186,
	"reward_std": 0.06065426021814346,
	"rewards/reward_matching": 0.49310362339019775,
	"rewards/reward_object_count": 0.5903645753860474,
	"rewards/reward_parseable": 1.0,
	"step": 72
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 767.0,
	"completions/mean_length": 74.953125,
	"completions/mean_terminated_length": 59.888888888888886,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 36.5,
	"grad_norm": 9.753658378031105e-05,
	"kl": 0.018295559682883322,
	"learning_rate": 2.9861437295232825e-05,
	"loss": 0.0001,
	"num_tokens": 2634492.0,
	"reward": 0.7185924649238586,
	"reward_std": 0.06972639262676239,
	"rewards/reward_matching": 0.6025491952896118,
	"rewards/reward_object_count": 0.8009397983551025,
	"rewards/reward_parseable": 0.984375,
	"step": 73
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.046875,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 658.0,
	"completions/mean_length": 270.625,
	"completions/mean_terminated_length": 233.5737704918033,
	"completions/min_length": 23.0,
	"completions/min_terminated_length": 23.0,
	"epoch": 37.0,
	"grad_norm": 0.0003199022903572768,
	"kl": 0.010636754508595914,
	"learning_rate": 2.9854771315339787e-05,
	"loss": -0.0004,
	"num_tokens": 2672036.0,
	"reward": 0.5783323049545288,
	"reward_std": 0.0985267236828804,
	"rewards/reward_matching": 0.4558379054069519,
	"rewards/reward_object_count": 0.5397727489471436,
	"rewards/reward_parseable": 0.984375,
	"step": 74
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.078125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 843.0,
	"completions/mean_length": 384.5625,
	"completions/mean_terminated_length": 330.3728813559322,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 37.5,
	"grad_norm": 0.00016259767289739102,
	"kl": 0.010080361389555037,
	"learning_rate": 2.984794951592481e-05,
	"loss": -0.0004,
	"num_tokens": 2714632.0,
	"reward": 0.4983637034893036,
	"reward_std": 0.06861061602830887,
	"rewards/reward_matching": 0.423081636428833,
	"rewards/reward_object_count": 0.2538233995437622,
	"rewards/reward_parseable": 0.96875,
	"step": 75
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 105.0,
	"completions/max_terminated_length": 105.0,
	"completions/mean_length": 51.953125,
	"completions/mean_terminated_length": 51.953125,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 38.0,
	"grad_norm": 5.508283356903121e-05,
	"kl": 0.020864711608737707,
	"learning_rate": 2.984097196854534e-05,
	"loss": -0.0,
	"num_tokens": 2738501.0,
	"reward": 0.7815486192703247,
	"reward_std": 0.04100874066352844,
	"rewards/reward_matching": 0.6749768853187561,
	"rewards/reward_object_count": 0.8828125,
	"rewards/reward_parseable": 1.0,
	"step": 76
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 709.0,
	"completions/mean_length": 244.484375,
	"completions/mean_terminated_length": 219.33870967741936,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 38.5,
	"grad_norm": 8.971132774604484e-05,
	"kl": 0.009314798924606293,
	"learning_rate": 2.9833838746392544e-05,
	"loss": 0.0004,
	"num_tokens": 2771492.0,
	"reward": 0.6448056697845459,
	"reward_std": 0.047796234488487244,
	"rewards/reward_matching": 0.5422253012657166,
	"rewards/reward_object_count": 0.5973524451255798,
	"rewards/reward_parseable": 1.0,
	"step": 77
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 568.0,
	"completions/mean_length": 236.46875,
	"completions/mean_terminated_length": 183.96666666666667,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 39.0,
	"grad_norm": 0.00015159579925239086,
	"kl": 0.01285445858957246,
	"learning_rate": 2.982654992429056e-05,
	"loss": -0.0002,
	"num_tokens": 2808130.0,
	"reward": 0.6628507971763611,
	"reward_std": 0.061860181391239166,
	"rewards/reward_matching": 0.564445972442627,
	"rewards/reward_object_count": 0.6365411281585693,
	"rewards/reward_parseable": 0.984375,
	"step": 78
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.046875,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 733.0,
	"completions/mean_length": 224.75,
	"completions/mean_terminated_length": 185.44262295081967,
	"completions/min_length": 70.0,
	"completions/min_terminated_length": 70.0,
	"epoch": 39.5,
	"grad_norm": 0.0001908452104544267,
	"kl": 0.015527774463407695,
	"learning_rate": 2.981910557869566e-05,
	"loss": -0.0002,
	"num_tokens": 2843378.0,
	"reward": 0.5593058466911316,
	"reward_std": 0.07343290746212006,
	"rewards/reward_matching": 0.48125869035720825,
	"rewards/reward_object_count": 0.3527531325817108,
	"rewards/reward_parseable": 1.0,
	"step": 79
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 600.0,
	"completions/max_terminated_length": 600.0,
	"completions/mean_length": 139.65625,
	"completions/mean_terminated_length": 139.65625,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 40.0,
	"grad_norm": 0.00013636126823257655,
	"kl": 0.01637664856389165,
	"learning_rate": 2.981150578769553e-05,
	"loss": 0.0001,
	"num_tokens": 2870300.0,
	"reward": 0.6560741066932678,
	"reward_std": 0.06251361966133118,
	"rewards/reward_matching": 0.5192815065383911,
	"rewards/reward_object_count": 0.7225260734558105,
	"rewards/reward_parseable": 1.0,
	"step": 80
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 197.0,
	"completions/max_terminated_length": 197.0,
	"completions/mean_length": 81.265625,
	"completions/mean_terminated_length": 81.265625,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 40.5,
	"grad_norm": 0.00010307910270057619,
	"kl": 0.01742720091715455,
	"learning_rate": 2.980375063100836e-05,
	"loss": -0.0,
	"num_tokens": 2895725.0,
	"reward": 0.7209770083427429,
	"reward_std": 0.04894069582223892,
	"rewards/reward_matching": 0.6191629767417908,
	"rewards/reward_object_count": 0.7473958730697632,
	"rewards/reward_parseable": 1.0,
	"step": 81
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 964.0,
	"completions/mean_length": 190.921875,
	"completions/mean_terminated_length": 177.6984126984127,
	"completions/min_length": 38.0,
	"completions/min_terminated_length": 38.0,
	"epoch": 41.0,
	"grad_norm": 6.521846080431715e-05,
	"kl": 0.01442325720563531,
	"learning_rate": 2.979584018998209e-05,
	"loss": 0.0001,
	"num_tokens": 2928488.0,
	"reward": 0.5859934091567993,
	"reward_std": 0.032855767756700516,
	"rewards/reward_matching": 0.503061056137085,
	"rewards/reward_object_count": 0.42078372836112976,
	"rewards/reward_parseable": 1.0,
	"step": 82
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.09375,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 927.0,
	"completions/mean_length": 206.546875,
	"completions/mean_terminated_length": 121.98275862068965,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 41.5,
	"grad_norm": 0.00013890951231587678,
	"kl": 0.011370213003829122,
	"learning_rate": 2.97877745475935e-05,
	"loss": -0.0001,
	"num_tokens": 2962251.0,
	"reward": 0.6619434356689453,
	"reward_std": 0.05908970534801483,
	"rewards/reward_matching": 0.545287013053894,
	"rewards/reward_object_count": 0.6894807815551758,
	"rewards/reward_parseable": 0.984375,
	"step": 83
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 919.0,
	"completions/mean_length": 195.953125,
	"completions/mean_terminated_length": 169.24193548387098,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 42.0,
	"grad_norm": 0.00014514043868985027,
	"kl": 0.01611044444143772,
	"learning_rate": 2.9779553788447358e-05,
	"loss": -0.0003,
	"num_tokens": 2995016.0,
	"reward": 0.5909014344215393,
	"reward_std": 0.07796993851661682,
	"rewards/reward_matching": 0.5025642514228821,
	"rewards/reward_object_count": 0.4624394476413727,
	"rewards/reward_parseable": 0.984375,
	"step": 84
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.046875,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 815.0,
	"completions/mean_length": 312.546875,
	"completions/mean_terminated_length": 277.55737704918033,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 42.5,
	"grad_norm": 0.00012988250819034874,
	"kl": 0.012926546158269048,
	"learning_rate": 2.977117799877554e-05,
	"loss": 0.0012,
	"num_tokens": 3033323.0,
	"reward": 0.5925983190536499,
	"reward_std": 0.056041646748781204,
	"rewards/reward_matching": 0.49923017621040344,
	"rewards/reward_object_count": 0.46530094742774963,
	"rewards/reward_parseable": 1.0,
	"step": 85
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 577.0,
	"completions/max_terminated_length": 577.0,
	"completions/mean_length": 187.328125,
	"completions/mean_terminated_length": 187.328125,
	"completions/min_length": 9.0,
	"completions/min_terminated_length": 9.0,
	"epoch": 43.0,
	"grad_norm": 0.00040236019412986934,
	"kl": 0.017144598648883402,
	"learning_rate": 2.9762647266436115e-05,
	"loss": -0.0008,
	"num_tokens": 3063296.0,
	"reward": 0.5428475141525269,
	"reward_std": 0.09201589226722717,
	"rewards/reward_matching": 0.4472660422325134,
	"rewards/reward_object_count": 0.41931426525115967,
	"rewards/reward_parseable": 0.953125,
	"step": 86
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 311.0,
	"completions/max_terminated_length": 311.0,
	"completions/mean_length": 88.671875,
	"completions/mean_terminated_length": 88.671875,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 43.5,
	"grad_norm": 8.879049710230902e-05,
	"kl": 0.016023017466068268,
	"learning_rate": 2.9753961680912432e-05,
	"loss": 0.0002,
	"num_tokens": 3085995.0,
	"reward": 0.687000036239624,
	"reward_std": 0.049605756998062134,
	"rewards/reward_matching": 0.5584548711776733,
	"rewards/reward_object_count": 0.7596354484558105,
	"rewards/reward_parseable": 1.0,
	"step": 87
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.046875,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 729.0,
	"completions/mean_length": 308.359375,
	"completions/mean_terminated_length": 273.1639344262295,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 44.0,
	"grad_norm": 0.00044992516632191837,
	"kl": 0.02378708287142217,
	"learning_rate": 2.9745121333312166e-05,
	"loss": -0.0027,
	"num_tokens": 3123394.0,
	"reward": 0.4918820559978485,
	"reward_std": 0.11219721287488937,
	"rewards/reward_matching": 0.3640925884246826,
	"rewards/reward_object_count": 0.4608825445175171,
	"rewards/reward_parseable": 0.90625,
	"step": 88
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 931.0,
	"completions/mean_length": 241.46875,
	"completions/mean_terminated_length": 189.3,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 44.5,
	"grad_norm": 8.519666153006256e-05,
	"kl": 0.012101836502552032,
	"learning_rate": 2.9736126316366385e-05,
	"loss": 0.0,
	"num_tokens": 3158112.0,
	"reward": 0.6382660865783691,
	"reward_std": 0.04055645316839218,
	"rewards/reward_matching": 0.4989316463470459,
	"rewards/reward_object_count": 0.6945353746414185,
	"rewards/reward_parseable": 1.0,
	"step": 89
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 434.0,
	"completions/mean_length": 164.859375,
	"completions/mean_terminated_length": 151.22222222222223,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 45.0,
	"grad_norm": 0.0005300024640746415,
	"kl": 0.029471338726580143,
	"learning_rate": 2.9726976724428563e-05,
	"loss": -0.0012,
	"num_tokens": 3189527.0,
	"reward": 0.6498422622680664,
	"reward_std": 0.13978248834609985,
	"rewards/reward_matching": 0.5533084869384766,
	"rewards/reward_object_count": 0.6361607313156128,
	"rewards/reward_parseable": 0.953125,
	"step": 90
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 238.0,
	"completions/max_terminated_length": 238.0,
	"completions/mean_length": 119.578125,
	"completions/mean_terminated_length": 119.578125,
	"completions/min_length": 70.0,
	"completions/min_terminated_length": 70.0,
	"epoch": 45.5,
	"grad_norm": 0.00018720145453698933,
	"kl": 0.02450899383984506,
	"learning_rate": 2.9717672653473588e-05,
	"loss": -0.0006,
	"num_tokens": 3217724.0,
	"reward": 0.6467303037643433,
	"reward_std": 0.05769674479961395,
	"rewards/reward_matching": 0.5479917526245117,
	"rewards/reward_object_count": 0.5896763205528259,
	"rewards/reward_parseable": 1.0,
	"step": 91
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 337.0,
	"completions/mean_length": 99.203125,
	"completions/mean_terminated_length": 84.52380952380952,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 46.0,
	"grad_norm": 7.427236414514482e-05,
	"kl": 0.022152320947498083,
	"learning_rate": 2.9708214201096758e-05,
	"loss": 0.0002,
	"num_tokens": 3242377.0,
	"reward": 0.6481877565383911,
	"reward_std": 0.06066766381263733,
	"rewards/reward_matching": 0.4977788031101227,
	"rewards/reward_object_count": 0.7476025223731995,
	"rewards/reward_parseable": 1.0,
	"step": 92
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 729.0,
	"completions/max_terminated_length": 729.0,
	"completions/mean_length": 219.96875,
	"completions/mean_terminated_length": 219.96875,
	"completions/min_length": 73.0,
	"completions/min_terminated_length": 73.0,
	"epoch": 46.5,
	"grad_norm": 0.00016481881903018802,
	"kl": 0.016606852994300425,
	"learning_rate": 2.9698601466512767e-05,
	"loss": 0.0002,
	"num_tokens": 3275079.0,
	"reward": 0.5845397710800171,
	"reward_std": 0.04967654123902321,
	"rewards/reward_matching": 0.5262070894241333,
	"rewards/reward_object_count": 0.3440776765346527,
	"rewards/reward_parseable": 1.0,
	"step": 93
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.046875,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 766.0,
	"completions/mean_length": 309.375,
	"completions/mean_terminated_length": 274.2295081967213,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 47.0,
	"grad_norm": 0.00014561145508196205,
	"kl": 0.014288356876932085,
	"learning_rate": 2.9688834550554647e-05,
	"loss": 0.0009,
	"num_tokens": 3315103.0,
	"reward": 0.6348440647125244,
	"reward_std": 0.07563169300556183,
	"rewards/reward_matching": 0.5213068723678589,
	"rewards/reward_object_count": 0.6102994084358215,
	"rewards/reward_parseable": 1.0,
	"step": 94
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 425.0,
	"completions/max_terminated_length": 425.0,
	"completions/mean_length": 139.234375,
	"completions/mean_terminated_length": 139.234375,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 47.5,
	"grad_norm": 0.00018322512914892286,
	"kl": 0.023316799197345972,
	"learning_rate": 2.9678913555672733e-05,
	"loss": -0.0007,
	"num_tokens": 3343918.0,
	"reward": 0.6510435342788696,
	"reward_std": 0.09565050899982452,
	"rewards/reward_matching": 0.5266523361206055,
	"rewards/reward_object_count": 0.6908854246139526,
	"rewards/reward_parseable": 0.984375,
	"step": 95
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 402.0,
	"completions/mean_length": 134.640625,
	"completions/mean_terminated_length": 120.52380952380952,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 48.0,
	"grad_norm": 0.00011491947225295007,
	"kl": 0.021518109366297722,
	"learning_rate": 2.966883858593356e-05,
	"loss": 0.0005,
	"num_tokens": 3373719.0,
	"reward": 0.7392737865447998,
	"reward_std": 0.04068930447101593,
	"rewards/reward_matching": 0.6502777338027954,
	"rewards/reward_object_count": 0.7455357313156128,
	"rewards/reward_parseable": 1.0,
	"step": 96
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 197.0,
	"completions/max_terminated_length": 197.0,
	"completions/mean_length": 80.046875,
	"completions/mean_terminated_length": 80.046875,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 48.5,
	"grad_norm": 9.114377462537959e-05,
	"kl": 0.03106052055954933,
	"learning_rate": 2.9658609747018796e-05,
	"loss": 0.0,
	"num_tokens": 3400666.0,
	"reward": 0.7638974785804749,
	"reward_std": 0.036661915481090546,
	"rewards/reward_matching": 0.6776763200759888,
	"rewards/reward_object_count": 0.7864583730697632,
	"rewards/reward_parseable": 1.0,
	"step": 97
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 760.0,
	"completions/max_terminated_length": 760.0,
	"completions/mean_length": 195.453125,
	"completions/mean_terminated_length": 195.453125,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 49.0,
	"grad_norm": 8.127772161969915e-05,
	"kl": 0.017332423012703657,
	"learning_rate": 2.964822714622412e-05,
	"loss": 0.0007,
	"num_tokens": 3430519.0,
	"reward": 0.6093304753303528,
	"reward_std": 0.046209633350372314,
	"rewards/reward_matching": 0.49052512645721436,
	"rewards/reward_object_count": 0.5750769972801208,
	"rewards/reward_parseable": 1.0,
	"step": 98
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 832.0,
	"completions/mean_length": 303.984375,
	"completions/mean_terminated_length": 280.758064516129,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 49.5,
	"grad_norm": 0.00037192818126641214,
	"kl": 0.02439494035206735,
	"learning_rate": 2.9637690892458103e-05,
	"loss": -0.0004,
	"num_tokens": 3470198.0,
	"reward": 0.5828637480735779,
	"reward_std": 0.1055147647857666,
	"rewards/reward_matching": 0.4695713520050049,
	"rewards/reward_object_count": 0.53685462474823,
	"rewards/reward_parseable": 0.96875,
	"step": 99
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 539.0,
	"completions/mean_length": 218.09375,
	"completions/mean_terminated_length": 205.3015873015873,
	"completions/min_length": 14.0,
	"completions/min_terminated_length": 14.0,
	"epoch": 50.0,
	"grad_norm": 0.0021957538556307554,
	"kl": 0.06775743188336492,
	"learning_rate": 2.962700109624106e-05,
	"loss": 0.0004,
	"num_tokens": 3501820.0,
	"reward": 0.6071096658706665,
	"reward_std": 0.09001424908638,
	"rewards/reward_matching": 0.4763699769973755,
	"rewards/reward_object_count": 0.6220631003379822,
	"rewards/reward_parseable": 0.984375,
	"step": 100
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 273.0,
	"completions/max_terminated_length": 273.0,
	"completions/mean_length": 96.8125,
	"completions/mean_terminated_length": 96.8125,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 50.5,
	"grad_norm": 7.189081952674314e-05,
	"kl": 0.026759767439216375,
	"learning_rate": 2.961615786970389e-05,
	"loss": 0.0001,
	"num_tokens": 3526640.0,
	"reward": 0.6088296175003052,
	"reward_std": 0.042443305253982544,
	"rewards/reward_matching": 0.4899764060974121,
	"rewards/reward_object_count": 0.57421875,
	"rewards/reward_parseable": 1.0,
	"step": 101
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 633.0,
	"completions/mean_length": 274.484375,
	"completions/mean_terminated_length": 250.30645161290323,
	"completions/min_length": 73.0,
	"completions/min_terminated_length": 73.0,
	"epoch": 51.0,
	"grad_norm": 0.0001283042220165953,
	"kl": 0.017867632559500635,
	"learning_rate": 2.960516132658692e-05,
	"loss": 0.0008,
	"num_tokens": 3564751.0,
	"reward": 0.6425777673721313,
	"reward_std": 0.05326495319604874,
	"rewards/reward_matching": 0.545784592628479,
	"rewards/reward_object_count": 0.5755347013473511,
	"rewards/reward_parseable": 1.0,
	"step": 102
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 910.0,
	"completions/mean_length": 265.46875,
	"completions/mean_terminated_length": 241.0,
	"completions/min_length": 73.0,
	"completions/min_terminated_length": 73.0,
	"epoch": 51.5,
	"grad_norm": 0.0001271759974770248,
	"kl": 0.016812809044495225,
	"learning_rate": 2.9594011582238672e-05,
	"loss": 0.0006,
	"num_tokens": 3601005.0,
	"reward": 0.5850105285644531,
	"reward_std": 0.048566583544015884,
	"rewards/reward_matching": 0.4884982705116272,
	"rewards/reward_object_count": 0.4595579504966736,
	"rewards/reward_parseable": 1.0,
	"step": 103
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.046875,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 957.0,
	"completions/mean_length": 153.640625,
	"completions/mean_terminated_length": 110.8360655737705,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 52.0,
	"grad_norm": 9.4698800239712e-05,
	"kl": 0.02293774695135653,
	"learning_rate": 2.95827087536147e-05,
	"loss": 0.0006,
	"num_tokens": 3632342.0,
	"reward": 0.7446116209030151,
	"reward_std": 0.04331940785050392,
	"rewards/reward_matching": 0.6290016770362854,
	"rewards/reward_object_count": 0.8360530138015747,
	"rewards/reward_parseable": 1.0,
	"step": 104
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 615.0,
	"completions/mean_length": 183.75,
	"completions/mean_terminated_length": 156.6451612903226,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 52.5,
	"grad_norm": 0.00010602272232063115,
	"kl": 0.021552881691604853,
	"learning_rate": 2.9571252959276313e-05,
	"loss": 0.0006,
	"num_tokens": 3662726.0,
	"reward": 0.579430341720581,
	"reward_std": 0.05669737607240677,
	"rewards/reward_matching": 0.4479978680610657,
	"rewards/reward_object_count": 0.5531580448150635,
	"rewards/reward_parseable": 1.0,
	"step": 105
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 273.0,
	"completions/max_terminated_length": 273.0,
	"completions/mean_length": 74.1875,
	"completions/mean_terminated_length": 74.1875,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 53.0,
	"grad_norm": 0.00010021215712185949,
	"kl": 0.0273100093472749,
	"learning_rate": 2.955964431938939e-05,
	"loss": 0.0001,
	"num_tokens": 3688018.0,
	"reward": 0.7427883148193359,
	"reward_std": 0.05413203686475754,
	"rewards/reward_matching": 0.6073381900787354,
	"rewards/reward_object_count": 0.8919271230697632,
	"rewards/reward_parseable": 1.0,
	"step": 106
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 311.0,
	"completions/max_terminated_length": 311.0,
	"completions/mean_length": 120.453125,
	"completions/mean_terminated_length": 120.453125,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 53.5,
	"grad_norm": 0.00012529945524875075,
	"kl": 0.028699786867946386,
	"learning_rate": 2.9547882955723052e-05,
	"loss": 0.0002,
	"num_tokens": 3713391.0,
	"reward": 0.6109194755554199,
	"reward_std": 0.055304668843746185,
	"rewards/reward_matching": 0.5087559223175049,
	"rewards/reward_object_count": 0.528329610824585,
	"rewards/reward_parseable": 1.0,
	"step": 107
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 502.0,
	"completions/mean_length": 183.0625,
	"completions/mean_terminated_length": 127.0,
	"completions/min_length": 22.0,
	"completions/min_terminated_length": 22.0,
	"epoch": 54.0,
	"grad_norm": 0.00030532677192240953,
	"kl": 0.03329086292069405,
	"learning_rate": 2.953596899164846e-05,
	"loss": -0.0005,
	"num_tokens": 3745331.0,
	"reward": 0.6400688886642456,
	"reward_std": 0.07565727084875107,
	"rewards/reward_matching": 0.5241180658340454,
	"rewards/reward_object_count": 0.6592400670051575,
	"rewards/reward_parseable": 0.96875,
	"step": 108
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 943.0,
	"completions/max_terminated_length": 943.0,
	"completions/mean_length": 246.90625,
	"completions/mean_terminated_length": 246.90625,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 54.5,
	"grad_norm": 0.00014556830865330994,
	"kl": 0.01744238520041108,
	"learning_rate": 2.9523902552137436e-05,
	"loss": 0.0005,
	"num_tokens": 3781357.0,
	"reward": 0.6747822761535645,
	"reward_std": 0.046912893652915955,
	"rewards/reward_matching": 0.6312285661697388,
	"rewards/reward_object_count": 0.48022598028182983,
	"rewards/reward_parseable": 1.0,
	"step": 109
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 898.0,
	"completions/mean_length": 231.8125,
	"completions/mean_terminated_length": 219.23809523809524,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 55.0,
	"grad_norm": 0.0007359112496487796,
	"kl": 0.03722105058841407,
	"learning_rate": 2.951168376376124e-05,
	"loss": -0.0018,
	"num_tokens": 3814177.0,
	"reward": 0.5341554880142212,
	"reward_std": 0.12695415318012238,
	"rewards/reward_matching": 0.3953123688697815,
	"rewards/reward_object_count": 0.5473405718803406,
	"rewards/reward_parseable": 0.9375,
	"step": 110
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 767.0,
	"completions/mean_length": 190.8125,
	"completions/mean_terminated_length": 163.93548387096774,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 55.5,
	"grad_norm": 0.00014860746159683913,
	"kl": 0.020320012234151363,
	"learning_rate": 2.9499312754689168e-05,
	"loss": -0.0,
	"num_tokens": 3846933.0,
	"reward": 0.6570301055908203,
	"reward_std": 0.08989942818880081,
	"rewards/reward_matching": 0.5301545858383179,
	"rewards/reward_object_count": 0.7103118300437927,
	"rewards/reward_parseable": 0.984375,
	"step": 111
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 699.0,
	"completions/mean_length": 207.671875,
	"completions/mean_terminated_length": 194.71428571428572,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 56.0,
	"grad_norm": 0.00010286509495927021,
	"kl": 0.01653504034038633,
	"learning_rate": 2.9486789654687256e-05,
	"loss": 0.0004,
	"num_tokens": 3879168.0,
	"reward": 0.6560405492782593,
	"reward_std": 0.05360962823033333,
	"rewards/reward_matching": 0.5268334746360779,
	"rewards/reward_object_count": 0.6997023820877075,
	"rewards/reward_parseable": 1.0,
	"step": 112
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 539.0,
	"completions/mean_length": 260.84375,
	"completions/mean_terminated_length": 236.2258064516129,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 56.5,
	"grad_norm": 0.00011981173156527802,
	"kl": 0.01864371739793569,
	"learning_rate": 2.94741145951169e-05,
	"loss": 0.0002,
	"num_tokens": 3915766.0,
	"reward": 0.5610636472702026,
	"reward_std": 0.04658506065607071,
	"rewards/reward_matching": 0.4559590816497803,
	"rewards/reward_object_count": 0.4374409317970276,
	"rewards/reward_parseable": 1.0,
	"step": 113
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 767.0,
	"completions/mean_length": 225.21875,
	"completions/mean_terminated_length": 199.4516129032258,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 57.0,
	"grad_norm": 0.00016182979743462056,
	"kl": 0.01676144660450518,
	"learning_rate": 2.9461287708933475e-05,
	"loss": 0.0009,
	"num_tokens": 3952004.0,
	"reward": 0.740257740020752,
	"reward_std": 0.05227687209844589,
	"rewards/reward_matching": 0.6612553596496582,
	"rewards/reward_object_count": 0.717522919178009,
	"rewards/reward_parseable": 1.0,
	"step": 114
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 272.0,
	"completions/max_terminated_length": 272.0,
	"completions/mean_length": 95.96875,
	"completions/mean_terminated_length": 95.96875,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 57.5,
	"grad_norm": 8.677188452566043e-05,
	"kl": 0.02889580768533051,
	"learning_rate": 2.9448309130684944e-05,
	"loss": -0.0,
	"num_tokens": 3979330.0,
	"reward": 0.7038345336914062,
	"reward_std": 0.03829924017190933,
	"rewards/reward_matching": 0.6301881074905396,
	"rewards/reward_object_count": 0.6286086440086365,
	"rewards/reward_parseable": 1.0,
	"step": 115
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 957.0,
	"completions/max_terminated_length": 957.0,
	"completions/mean_length": 249.09375,
	"completions/mean_terminated_length": 249.09375,
	"completions/min_length": 73.0,
	"completions/min_terminated_length": 73.0,
	"epoch": 58.0,
	"grad_norm": 0.00013837260485161096,
	"kl": 0.018972176825627685,
	"learning_rate": 2.9435178996510456e-05,
	"loss": 0.0004,
	"num_tokens": 4015496.0,
	"reward": 0.6081739664077759,
	"reward_std": 0.06578633934259415,
	"rewards/reward_matching": 0.49933409690856934,
	"rewards/reward_object_count": 0.5428677797317505,
	"rewards/reward_parseable": 1.0,
	"step": 116
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 764.0,
	"completions/mean_length": 206.546875,
	"completions/mean_terminated_length": 180.17741935483872,
	"completions/min_length": 20.0,
	"completions/min_terminated_length": 20.0,
	"epoch": 58.5,
	"grad_norm": 0.0002509634068701416,
	"kl": 0.02328762272372842,
	"learning_rate": 2.9421897444138902e-05,
	"loss": -0.0,
	"num_tokens": 4047979.0,
	"reward": 0.6523654460906982,
	"reward_std": 0.09278056025505066,
	"rewards/reward_matching": 0.5031481981277466,
	"rewards/reward_object_count": 0.7680073976516724,
	"rewards/reward_parseable": 0.984375,
	"step": 117
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 577.0,
	"completions/max_terminated_length": 577.0,
	"completions/mean_length": 121.125,
	"completions/mean_terminated_length": 121.125,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 59.0,
	"grad_norm": 7.174992060754448e-05,
	"kl": 0.02193958149291575,
	"learning_rate": 2.9408464612887484e-05,
	"loss": 0.0001,
	"num_tokens": 4075955.0,
	"reward": 0.6293376088142395,
	"reward_std": 0.03621161729097366,
	"rewards/reward_matching": 0.5260950922966003,
	"rewards/reward_object_count": 0.5684027671813965,
	"rewards/reward_parseable": 1.0,
	"step": 118
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 964.0,
	"completions/mean_length": 308.25,
	"completions/mean_terminated_length": 260.53333333333336,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 59.5,
	"grad_norm": 0.00014954354264773428,
	"kl": 0.019514269777573645,
	"learning_rate": 2.9394880643660242e-05,
	"loss": -0.0001,
	"num_tokens": 4120067.0,
	"reward": 0.664871096611023,
	"reward_std": 0.0682295560836792,
	"rewards/reward_matching": 0.5886327624320984,
	"rewards/reward_object_count": 0.57408207654953,
	"rewards/reward_parseable": 0.984375,
	"step": 119
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 1000.0,
	"completions/mean_length": 254.0625,
	"completions/mean_terminated_length": 229.2258064516129,
	"completions/min_length": 23.0,
	"completions/min_terminated_length": 23.0,
	"epoch": 60.0,
	"grad_norm": 0.0005555336247198284,
	"kl": 0.06151175429113209,
	"learning_rate": 2.938114567894659e-05,
	"loss": -0.0007,
	"num_tokens": 4153351.0,
	"reward": 0.49984338879585266,
	"reward_std": 0.08151112496852875,
	"rewards/reward_matching": 0.3786693215370178,
	"rewards/reward_object_count": 0.3944591283798218,
	"rewards/reward_parseable": 0.96875,
	"step": 120
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 709.0,
	"completions/mean_length": 302.734375,
	"completions/mean_terminated_length": 279.46774193548384,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 60.5,
	"grad_norm": 0.00010743723396444693,
	"kl": 0.017743419273756444,
	"learning_rate": 2.9367259862819805e-05,
	"loss": 0.0007,
	"num_tokens": 4190070.0,
	"reward": 0.5756763219833374,
	"reward_std": 0.05204359441995621,
	"rewards/reward_matching": 0.43783289194107056,
	"rewards/reward_object_count": 0.5648829936981201,
	"rewards/reward_parseable": 1.0,
	"step": 121
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 159.0,
	"completions/max_terminated_length": 159.0,
	"completions/mean_length": 71.453125,
	"completions/mean_terminated_length": 71.453125,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 61.0,
	"grad_norm": 7.059347990434617e-05,
	"kl": 0.02535859332419932,
	"learning_rate": 2.9353223340935533e-05,
	"loss": -0.0001,
	"num_tokens": 4212627.0,
	"reward": 0.6173241138458252,
	"reward_std": 0.045353930443525314,
	"rewards/reward_matching": 0.50152987241745,
	"rewards/reward_object_count": 0.58203125,
	"rewards/reward_parseable": 1.0,
	"step": 122
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 235.0,
	"completions/max_terminated_length": 235.0,
	"completions/mean_length": 105.625,
	"completions/mean_terminated_length": 105.625,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 61.5,
	"grad_norm": 7.782715692883357e-05,
	"kl": 0.02937923581339419,
	"learning_rate": 2.933903626053024e-05,
	"loss": -0.0,
	"num_tokens": 4237051.0,
	"reward": 0.6152711510658264,
	"reward_std": 0.04115668684244156,
	"rewards/reward_matching": 0.5210062265396118,
	"rewards/reward_object_count": 0.5133370757102966,
	"rewards/reward_parseable": 1.0,
	"step": 123
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.046875,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 995.0,
	"completions/mean_length": 237.28125,
	"completions/mean_terminated_length": 198.59016393442624,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 62.0,
	"grad_norm": 0.0003079625021200627,
	"kl": 0.03858275443781167,
	"learning_rate": 2.932469877041969e-05,
	"loss": -0.0022,
	"num_tokens": 4273101.0,
	"reward": 0.5917791128158569,
	"reward_std": 0.07503822445869446,
	"rewards/reward_matching": 0.49250179529190063,
	"rewards/reward_object_count": 0.5282653570175171,
	"rewards/reward_parseable": 0.953125,
	"step": 124
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 943.0,
	"completions/max_terminated_length": 943.0,
	"completions/mean_length": 147.421875,
	"completions/mean_terminated_length": 147.421875,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 62.5,
	"grad_norm": 0.0001133783589466475,
	"kl": 0.021977555472403765,
	"learning_rate": 2.931021102099737e-05,
	"loss": 0.0001,
	"num_tokens": 4300520.0,
	"reward": 0.7027174234390259,
	"reward_std": 0.05097541958093643,
	"rewards/reward_matching": 0.5685935616493225,
	"rewards/reward_object_count": 0.8078063130378723,
	"rewards/reward_parseable": 1.0,
	"step": 125
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 273.0,
	"completions/max_terminated_length": 273.0,
	"completions/mean_length": 80.84375,
	"completions/mean_terminated_length": 80.84375,
	"completions/min_length": 12.0,
	"completions/min_terminated_length": 12.0,
	"epoch": 63.0,
	"grad_norm": 9.239943028660491e-05,
	"kl": 0.04109277273528278,
	"learning_rate": 2.9295573164232913e-05,
	"loss": -0.0002,
	"num_tokens": 4322718.0,
	"reward": 0.5727678537368774,
	"reward_std": 0.06367385387420654,
	"rewards/reward_matching": 0.44185274839401245,
	"rewards/reward_object_count": 0.553906261920929,
	"rewards/reward_parseable": 0.984375,
	"step": 126
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 540.0,
	"completions/max_terminated_length": 540.0,
	"completions/mean_length": 165.65625,
	"completions/mean_terminated_length": 165.65625,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 63.5,
	"grad_norm": 8.721143240109086e-05,
	"kl": 0.02322767348960042,
	"learning_rate": 2.9280785353670514e-05,
	"loss": 0.0001,
	"num_tokens": 4355464.0,
	"reward": 0.7161321640014648,
	"reward_std": 0.0398261621594429,
	"rewards/reward_matching": 0.6192046403884888,
	"rewards/reward_object_count": 0.7230468988418579,
	"rewards/reward_parseable": 1.0,
	"step": 127
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 121.0,
	"completions/max_terminated_length": 121.0,
	"completions/mean_length": 54.1875,
	"completions/mean_terminated_length": 54.1875,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 64.0,
	"grad_norm": 9.180100460071117e-05,
	"kl": 0.030635279836133122,
	"learning_rate": 2.9265847744427305e-05,
	"loss": 0.0,
	"num_tokens": 4377236.0,
	"reward": 0.7174146771430969,
	"reward_std": 0.06405159085988998,
	"rewards/reward_matching": 0.5802397727966309,
	"rewards/reward_object_count": 0.8463541865348816,
	"rewards/reward_parseable": 1.0,
	"step": 128
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 273.0,
	"completions/max_terminated_length": 273.0,
	"completions/mean_length": 132.71875,
	"completions/mean_terminated_length": 132.71875,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 64.5,
	"grad_norm": 0.00010723716695792973,
	"kl": 0.024373686173930764,
	"learning_rate": 2.925076049319174e-05,
	"loss": 0.0002,
	"num_tokens": 4405954.0,
	"reward": 0.6944972276687622,
	"reward_std": 0.06675916910171509,
	"rewards/reward_matching": 0.543346107006073,
	"rewards/reward_object_count": 0.8424479365348816,
	"rewards/reward_parseable": 1.0,
	"step": 129
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 667.0,
	"completions/mean_length": 165.234375,
	"completions/mean_terminated_length": 151.6031746031746,
	"completions/min_length": 39.0,
	"completions/min_terminated_length": 39.0,
	"epoch": 65.0,
	"grad_norm": 8.877630898496136e-05,
	"kl": 0.017184904776513577,
	"learning_rate": 2.9235523758221944e-05,
	"loss": 0.0004,
	"num_tokens": 4434193.0,
	"reward": 0.5842474699020386,
	"reward_std": 0.059734977781772614,
	"rewards/reward_matching": 0.47713083028793335,
	"rewards/reward_object_count": 0.4898448884487152,
	"rewards/reward_parseable": 1.0,
	"step": 130
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 615.0,
	"completions/max_terminated_length": 615.0,
	"completions/mean_length": 151.40625,
	"completions/mean_terminated_length": 151.40625,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 65.5,
	"grad_norm": 0.00013446353841573,
	"kl": 0.02186008053831756,
	"learning_rate": 2.922013769934406e-05,
	"loss": 0.0002,
	"num_tokens": 4461547.0,
	"reward": 0.610063910484314,
	"reward_std": 0.07120901346206665,
	"rewards/reward_matching": 0.43639126420021057,
	"rewards/reward_object_count": 0.7411458492279053,
	"rewards/reward_parseable": 1.0,
	"step": 131
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 205.0,
	"completions/mean_length": 88.21875,
	"completions/mean_terminated_length": 73.36507936507937,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 66.0,
	"grad_norm": 5.741886707255617e-05,
	"kl": 0.023789451690390706,
	"learning_rate": 2.920460247795056e-05,
	"loss": 0.0002,
	"num_tokens": 4485177.0,
	"reward": 0.6123472452163696,
	"reward_std": 0.04124218225479126,
	"rewards/reward_matching": 0.4969410002231598,
	"rewards/reward_object_count": 0.5709134936332703,
	"rewards/reward_parseable": 1.0,
	"step": 132
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.109375,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 799.0,
	"completions/mean_length": 361.5,
	"completions/mean_terminated_length": 280.140350877193,
	"completions/min_length": 11.0,
	"completions/min_terminated_length": 11.0,
	"epoch": 66.5,
	"grad_norm": 0.0001969350705621764,
	"kl": 0.031420703046023846,
	"learning_rate": 2.918891825699857e-05,
	"loss": 0.0004,
	"num_tokens": 4527897.0,
	"reward": 0.5840119123458862,
	"reward_std": 0.09337732195854187,
	"rewards/reward_matching": 0.4611853361129761,
	"rewards/reward_object_count": 0.5521284341812134,
	"rewards/reward_parseable": 0.984375,
	"step": 133
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 700.0,
	"completions/mean_length": 173.21875,
	"completions/mean_terminated_length": 159.71428571428572,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 67.0,
	"grad_norm": 0.0017126374877989292,
	"kl": 0.061008882825262845,
	"learning_rate": 2.9173085201008144e-05,
	"loss": 0.0004,
	"num_tokens": 4557607.0,
	"reward": 0.5862646698951721,
	"reward_std": 0.09075936675071716,
	"rewards/reward_matching": 0.46519267559051514,
	"rewards/reward_object_count": 0.5669952630996704,
	"rewards/reward_parseable": 0.96875,
	"step": 134
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 539.0,
	"completions/max_terminated_length": 539.0,
	"completions/mean_length": 131.765625,
	"completions/mean_terminated_length": 131.765625,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 67.5,
	"grad_norm": 7.290254143299535e-05,
	"kl": 0.022246332373470068,
	"learning_rate": 2.9157103476060547e-05,
	"loss": 0.0,
	"num_tokens": 4585944.0,
	"reward": 0.7023236155509949,
	"reward_std": 0.04230645298957825,
	"rewards/reward_matching": 0.5999230146408081,
	"rewards/reward_object_count": 0.7118489742279053,
	"rewards/reward_parseable": 1.0,
	"step": 135
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 766.0,
	"completions/mean_length": 188.296875,
	"completions/mean_terminated_length": 175.03174603174602,
	"completions/min_length": 24.0,
	"completions/min_terminated_length": 24.0,
	"epoch": 68.0,
	"grad_norm": 0.0003481461899355054,
	"kl": 0.04150618601124734,
	"learning_rate": 2.914097324979651e-05,
	"loss": -0.0007,
	"num_tokens": 4616619.0,
	"reward": 0.5926013588905334,
	"reward_std": 0.08262215554714203,
	"rewards/reward_matching": 0.4751903712749481,
	"rewards/reward_object_count": 0.56868577003479,
	"rewards/reward_parseable": 0.96875,
	"step": 136
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 463.0,
	"completions/mean_length": 159.109375,
	"completions/mean_terminated_length": 145.38095238095238,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 68.5,
	"grad_norm": 0.0001742025197017938,
	"kl": 0.020869133877567947,
	"learning_rate": 2.9124694691414485e-05,
	"loss": 0.0004,
	"num_tokens": 4648946.0,
	"reward": 0.7437945008277893,
	"reward_std": 0.036315254867076874,
	"rewards/reward_matching": 0.6901907920837402,
	"rewards/reward_object_count": 0.6484003067016602,
	"rewards/reward_parseable": 1.0,
	"step": 137
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 665.0,
	"completions/mean_length": 190.71875,
	"completions/mean_terminated_length": 177.4920634920635,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 69.0,
	"grad_norm": 0.00010955618199659511,
	"kl": 0.015927789616398513,
	"learning_rate": 2.9108267971668828e-05,
	"loss": 0.0006,
	"num_tokens": 4680416.0,
	"reward": 0.5738010406494141,
	"reward_std": 0.057256221771240234,
	"rewards/reward_matching": 0.44618654251098633,
	"rewards/reward_object_count": 0.530445396900177,
	"rewards/reward_parseable": 1.0,
	"step": 138
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 238.0,
	"completions/max_terminated_length": 238.0,
	"completions/mean_length": 101.78125,
	"completions/mean_terminated_length": 101.78125,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 69.5,
	"grad_norm": 9.076119749806821e-05,
	"kl": 0.025645660003647208,
	"learning_rate": 2.909169326286807e-05,
	"loss": -0.0002,
	"num_tokens": 4709394.0,
	"reward": 0.7509521245956421,
	"reward_std": 0.05357357859611511,
	"rewards/reward_matching": 0.6316216588020325,
	"rewards/reward_object_count": 0.8598958253860474,
	"rewards/reward_parseable": 1.0,
	"step": 139
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 919.0,
	"completions/max_terminated_length": 919.0,
	"completions/mean_length": 174.90625,
	"completions/mean_terminated_length": 174.90625,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 70.0,
	"grad_norm": 6.338445382425562e-05,
	"kl": 0.015905980253592134,
	"learning_rate": 2.9074970738873054e-05,
	"loss": 0.0001,
	"num_tokens": 4738892.0,
	"reward": 0.6288744807243347,
	"reward_std": 0.058405984193086624,
	"rewards/reward_matching": 0.4976961612701416,
	"rewards/reward_object_count": 0.6512840986251831,
	"rewards/reward_parseable": 1.0,
	"step": 140
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 273.0,
	"completions/mean_length": 116.984375,
	"completions/mean_terminated_length": 102.58730158730158,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 70.5,
	"grad_norm": 9.006184700410813e-05,
	"kl": 0.02571859711315483,
	"learning_rate": 2.9058100575095156e-05,
	"loss": 0.0003,
	"num_tokens": 4764363.0,
	"reward": 0.5978801250457764,
	"reward_std": 0.056724559515714645,
	"rewards/reward_matching": 0.49124425649642944,
	"rewards/reward_object_count": 0.5156679153442383,
	"rewards/reward_parseable": 1.0,
	"step": 141
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 994.0,
	"completions/mean_length": 309.84375,
	"completions/mean_terminated_length": 262.23333333333335,
	"completions/min_length": 18.0,
	"completions/min_terminated_length": 18.0,
	"epoch": 71.0,
	"grad_norm": 0.0003815116360783577,
	"kl": 0.029195085866376758,
	"learning_rate": 2.90410829484944e-05,
	"loss": -0.0006,
	"num_tokens": 4806337.0,
	"reward": 0.6275303363800049,
	"reward_std": 0.11912961304187775,
	"rewards/reward_matching": 0.5174634456634521,
	"rewards/reward_object_count": 0.6165112257003784,
	"rewards/reward_parseable": 0.96875,
	"step": 142
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 463.0,
	"completions/max_terminated_length": 463.0,
	"completions/mean_length": 104.703125,
	"completions/mean_terminated_length": 104.703125,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 71.5,
	"grad_norm": 9.876031981548294e-05,
	"kl": 0.02460621646605432,
	"learning_rate": 2.902391803757764e-05,
	"loss": 0.0002,
	"num_tokens": 4834542.0,
	"reward": 0.6761475205421448,
	"reward_std": 0.047544464468955994,
	"rewards/reward_matching": 0.5909047722816467,
	"rewards/reward_object_count": 0.6080232858657837,
	"rewards/reward_parseable": 1.0,
	"step": 143
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 501.0,
	"completions/max_terminated_length": 501.0,
	"completions/mean_length": 85.65625,
	"completions/mean_terminated_length": 85.65625,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 72.0,
	"grad_norm": 9.556031000101939e-05,
	"kl": 0.02564686187542975,
	"learning_rate": 2.900660602239667e-05,
	"loss": 0.0003,
	"num_tokens": 4857368.0,
	"reward": 0.650336742401123,
	"reward_std": 0.06377670913934708,
	"rewards/reward_matching": 0.5044280290603638,
	"rewards/reward_object_count": 0.7383996248245239,
	"rewards/reward_parseable": 1.0,
	"step": 144
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.078125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 969.0,
	"completions/mean_length": 297.65625,
	"completions/mean_terminated_length": 236.10169491525423,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 72.5,
	"grad_norm": 0.00020726142975036055,
	"kl": 0.026772375334985554,
	"learning_rate": 2.8989147084546335e-05,
	"loss": -0.0007,
	"num_tokens": 4896322.0,
	"reward": 0.6142957210540771,
	"reward_std": 0.06263671815395355,
	"rewards/reward_matching": 0.5525070428848267,
	"rewards/reward_object_count": 0.4295823872089386,
	"rewards/reward_parseable": 0.984375,
	"step": 145
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 501.0,
	"completions/max_terminated_length": 501.0,
	"completions/mean_length": 152.96875,
	"completions/mean_terminated_length": 152.96875,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 73.0,
	"grad_norm": 7.541166269220412e-05,
	"kl": 0.018190920585766435,
	"learning_rate": 2.8971541407162637e-05,
	"loss": 0.0002,
	"num_tokens": 4926016.0,
	"reward": 0.6451115608215332,
	"reward_std": 0.041765011847019196,
	"rewards/reward_matching": 0.5400669574737549,
	"rewards/reward_object_count": 0.6053571701049805,
	"rewards/reward_parseable": 1.0,
	"step": 146
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 387.0,
	"completions/mean_length": 177.90625,
	"completions/mean_terminated_length": 150.61290322580646,
	"completions/min_length": 21.0,
	"completions/min_terminated_length": 21.0,
	"epoch": 73.5,
	"grad_norm": 0.00026374394656158984,
	"kl": 0.028221046086400747,
	"learning_rate": 2.8953789174920795e-05,
	"loss": -0.0004,
	"num_tokens": 4958266.0,
	"reward": 0.6425023078918457,
	"reward_std": 0.08397432416677475,
	"rewards/reward_matching": 0.5445351600646973,
	"rewards/reward_object_count": 0.5945312976837158,
	"rewards/reward_parseable": 0.984375,
	"step": 147
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 995.0,
	"completions/mean_length": 166.0625,
	"completions/mean_terminated_length": 152.44444444444446,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 74.0,
	"grad_norm": 0.00026537227677181363,
	"kl": 0.03987942379899323,
	"learning_rate": 2.8935890574033325e-05,
	"loss": -0.0017,
	"num_tokens": 4987198.0,
	"reward": 0.6173149347305298,
	"reward_std": 0.08371478319168091,
	"rewards/reward_matching": 0.48869776725769043,
	"rewards/reward_object_count": 0.667356550693512,
	"rewards/reward_parseable": 0.953125,
	"step": 148
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 237.0,
	"completions/max_terminated_length": 237.0,
	"completions/mean_length": 91.21875,
	"completions/mean_terminated_length": 91.21875,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 74.5,
	"grad_norm": 0.00011484589049359784,
	"kl": 0.02938173897564411,
	"learning_rate": 2.8917845792248085e-05,
	"loss": -0.0002,
	"num_tokens": 5014860.0,
	"reward": 0.7492313981056213,
	"reward_std": 0.06530742347240448,
	"rewards/reward_matching": 0.6417745351791382,
	"rewards/reward_object_count": 0.8208333253860474,
	"rewards/reward_parseable": 1.0,
	"step": 149
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 733.0,
	"completions/mean_length": 258.140625,
	"completions/mean_terminated_length": 233.43548387096774,
	"completions/min_length": 73.0,
	"completions/min_terminated_length": 73.0,
	"epoch": 75.0,
	"grad_norm": 0.0001241009886143729,
	"kl": 0.014125383459031582,
	"learning_rate": 2.8899655018846297e-05,
	"loss": 0.0007,
	"num_tokens": 5050325.0,
	"reward": 0.6234834790229797,
	"reward_std": 0.05080155283212662,
	"rewards/reward_matching": 0.5106083750724792,
	"rewards/reward_object_count": 0.5855922698974609,
	"rewards/reward_parseable": 1.0,
	"step": 150
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 963.0,
	"completions/mean_length": 316.859375,
	"completions/mean_terminated_length": 269.71666666666664,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 75.5,
	"grad_norm": 0.000219573121285066,
	"kl": 0.019302582019008696,
	"learning_rate": 2.8881318444640564e-05,
	"loss": -0.0001,
	"num_tokens": 5090508.0,
	"reward": 0.6606755256652832,
	"reward_std": 0.08510918915271759,
	"rewards/reward_matching": 0.5664112567901611,
	"rewards/reward_object_count": 0.6197687387466431,
	"rewards/reward_parseable": 0.984375,
	"step": 151
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 970.0,
	"completions/mean_length": 276.375,
	"completions/mean_terminated_length": 252.25806451612902,
	"completions/min_length": 24.0,
	"completions/min_terminated_length": 24.0,
	"epoch": 76.0,
	"grad_norm": 0.00031149154528975487,
	"kl": 0.029386045061983168,
	"learning_rate": 2.8862836261972873e-05,
	"loss": -0.0005,
	"num_tokens": 5125540.0,
	"reward": 0.5237194299697876,
	"reward_std": 0.09430050849914551,
	"rewards/reward_matching": 0.39478474855422974,
	"rewards/reward_object_count": 0.4654930830001831,
	"rewards/reward_parseable": 0.96875,
	"step": 152
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 271.0,
	"completions/max_terminated_length": 271.0,
	"completions/mean_length": 124.234375,
	"completions/mean_terminated_length": 124.234375,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 76.5,
	"grad_norm": 0.00011776048631872982,
	"kl": 0.03456262964755297,
	"learning_rate": 2.8844208664712577e-05,
	"loss": -0.0003,
	"num_tokens": 5151795.0,
	"reward": 0.6158008575439453,
	"reward_std": 0.07029575109481812,
	"rewards/reward_matching": 0.5279592275619507,
	"rewards/reward_object_count": 0.510751485824585,
	"rewards/reward_parseable": 0.984375,
	"step": 153
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 121.0,
	"completions/max_terminated_length": 121.0,
	"completions/mean_length": 54.828125,
	"completions/mean_terminated_length": 54.828125,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 77.0,
	"grad_norm": 8.856238127918914e-05,
	"kl": 0.02841056394390762,
	"learning_rate": 2.882543584825435e-05,
	"loss": 0.0,
	"num_tokens": 5175208.0,
	"reward": 0.7693890333175659,
	"reward_std": 0.05110414698719978,
	"rewards/reward_matching": 0.6547108888626099,
	"rewards/reward_object_count": 0.8828125,
	"rewards/reward_parseable": 1.0,
	"step": 154
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 964.0,
	"completions/mean_length": 310.921875,
	"completions/mean_terminated_length": 263.3833333333333,
	"completions/min_length": 18.0,
	"completions/min_terminated_length": 18.0,
	"epoch": 77.5,
	"grad_norm": 0.0002557071566116065,
	"kl": 0.022015991620719433,
	"learning_rate": 2.880651800951616e-05,
	"loss": 0.0002,
	"num_tokens": 5219171.0,
	"reward": 0.6839065551757812,
	"reward_std": 0.06827011704444885,
	"rewards/reward_matching": 0.6086301803588867,
	"rewards/reward_object_count": 0.6092674732208252,
	"rewards/reward_parseable": 0.984375,
	"step": 155
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 964.0,
	"completions/mean_length": 202.609375,
	"completions/mean_terminated_length": 176.11290322580646,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 78.0,
	"grad_norm": 8.753160363994539e-05,
	"kl": 0.018119822721928358,
	"learning_rate": 2.8787455346937182e-05,
	"loss": 0.0002,
	"num_tokens": 5250442.0,
	"reward": 0.6155011653900146,
	"reward_std": 0.04511785879731178,
	"rewards/reward_matching": 0.4806468188762665,
	"rewards/reward_object_count": 0.6355655193328857,
	"rewards/reward_parseable": 1.0,
	"step": 156
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 997.0,
	"completions/mean_length": 287.390625,
	"completions/mean_terminated_length": 263.6290322580645,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 78.5,
	"grad_norm": 0.00010502748773433268,
	"kl": 0.015278441365808249,
	"learning_rate": 2.876824806047573e-05,
	"loss": 0.0005,
	"num_tokens": 5288099.0,
	"reward": 0.5914611220359802,
	"reward_std": 0.04948745667934418,
	"rewards/reward_matching": 0.48842769861221313,
	"rewards/reward_object_count": 0.4920225143432617,
	"rewards/reward_parseable": 1.0,
	"step": 157
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 577.0,
	"completions/max_terminated_length": 577.0,
	"completions/mean_length": 222.75,
	"completions/mean_terminated_length": 222.75,
	"completions/min_length": 25.0,
	"completions/min_terminated_length": 25.0,
	"epoch": 79.0,
	"grad_norm": 0.0001686933246674016,
	"kl": 0.028436586260795593,
	"learning_rate": 2.8748896351607145e-05,
	"loss": -0.0001,
	"num_tokens": 5320979.0,
	"reward": 0.6137920618057251,
	"reward_std": 0.08775545656681061,
	"rewards/reward_matching": 0.5258926153182983,
	"rewards/reward_object_count": 0.5069072246551514,
	"rewards/reward_parseable": 0.984375,
	"step": 158
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.078125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 1021.0,
	"completions/mean_length": 389.984375,
	"completions/mean_terminated_length": 336.2542372881356,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 79.5,
	"grad_norm": 0.00013628082524519414,
	"kl": 0.013047700515016913,
	"learning_rate": 2.8729400423321693e-05,
	"loss": 0.0013,
	"num_tokens": 5362642.0,
	"reward": 0.5530112385749817,
	"reward_std": 0.069613516330719,
	"rewards/reward_matching": 0.4320271611213684,
	"rewards/reward_object_count": 0.4689747095108032,
	"rewards/reward_parseable": 1.0,
	"step": 159
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 205.0,
	"completions/mean_length": 64.96875,
	"completions/mean_terminated_length": 49.74603174603175,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 80.0,
	"grad_norm": 0.00011326887761242688,
	"kl": 0.03361499134916812,
	"learning_rate": 2.8709760480122443e-05,
	"loss": 0.0001,
	"num_tokens": 5388944.0,
	"reward": 0.7502645254135132,
	"reward_std": 0.04951602220535278,
	"rewards/reward_matching": 0.6452133059501648,
	"rewards/reward_object_count": 0.8156828880310059,
	"rewards/reward_parseable": 1.0,
	"step": 160
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 273.0,
	"completions/mean_length": 170.109375,
	"completions/mean_terminated_length": 156.55555555555554,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 80.5,
	"grad_norm": 0.00018552214896772057,
	"kl": 0.02882540924474597,
	"learning_rate": 2.8689976728023103e-05,
	"loss": 0.0002,
	"num_tokens": 5421015.0,
	"reward": 0.6651661992073059,
	"reward_std": 0.072138212621212,
	"rewards/reward_matching": 0.5593208074569702,
	"rewards/reward_object_count": 0.6478685140609741,
	"rewards/reward_parseable": 1.0,
	"step": 161
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 501.0,
	"completions/max_terminated_length": 501.0,
	"completions/mean_length": 183.34375,
	"completions/mean_terminated_length": 183.34375,
	"completions/min_length": 24.0,
	"completions/min_terminated_length": 24.0,
	"epoch": 81.0,
	"grad_norm": 0.00038360359030775726,
	"kl": 0.06057113886345178,
	"learning_rate": 2.8670049374545873e-05,
	"loss": -0.0005,
	"num_tokens": 5452013.0,
	"reward": 0.6705402135848999,
	"reward_std": 0.06918413937091827,
	"rewards/reward_matching": 0.5548437833786011,
	"rewards/reward_object_count": 0.7037945985794067,
	"rewards/reward_parseable": 0.984375,
	"step": 162
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 995.0,
	"completions/mean_length": 385.5,
	"completions/mean_terminated_length": 364.9032258064516,
	"completions/min_length": 83.0,
	"completions/min_terminated_length": 83.0,
	"epoch": 81.5,
	"grad_norm": 0.00013938083429820836,
	"kl": 0.012367542018182576,
	"learning_rate": 2.8649978628719256e-05,
	"loss": 0.0008,
	"num_tokens": 5495309.0,
	"reward": 0.5702022314071655,
	"reward_std": 0.05095814913511276,
	"rewards/reward_matching": 0.48750579357147217,
	"rewards/reward_object_count": 0.38849371671676636,
	"rewards/reward_parseable": 1.0,
	"step": 163
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 425.0,
	"completions/max_terminated_length": 425.0,
	"completions/mean_length": 141.015625,
	"completions/mean_terminated_length": 141.015625,
	"completions/min_length": 9.0,
	"completions/min_terminated_length": 9.0,
	"epoch": 82.0,
	"grad_norm": 8.968032489065081e-05,
	"kl": 0.044206105871126056,
	"learning_rate": 2.8629764701075885e-05,
	"loss": -0.0002,
	"num_tokens": 5521998.0,
	"reward": 0.6258660554885864,
	"reward_std": 0.07497484982013702,
	"rewards/reward_matching": 0.47830966114997864,
	"rewards/reward_object_count": 0.7100260257720947,
	"rewards/reward_parseable": 0.984375,
	"step": 164
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.09375,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 1021.0,
	"completions/mean_length": 354.9375,
	"completions/mean_terminated_length": 285.7241379310345,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 82.5,
	"grad_norm": 0.00014277624723035842,
	"kl": 0.012639820342883468,
	"learning_rate": 2.8609407803650295e-05,
	"loss": 0.0009,
	"num_tokens": 5564298.0,
	"reward": 0.604388952255249,
	"reward_std": 0.036000728607177734,
	"rewards/reward_matching": 0.5304588079452515,
	"rewards/reward_object_count": 0.43056821823120117,
	"rewards/reward_parseable": 1.0,
	"step": 165
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 633.0,
	"completions/mean_length": 265.09375,
	"completions/mean_terminated_length": 240.61290322580646,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 83.0,
	"grad_norm": 0.0005736637976951897,
	"kl": 0.04679834772832692,
	"learning_rate": 2.8588908149976702e-05,
	"loss": -0.0014,
	"num_tokens": 5598928.0,
	"reward": 0.5598403215408325,
	"reward_std": 0.11772333085536957,
	"rewards/reward_matching": 0.47532498836517334,
	"rewards/reward_object_count": 0.43572670221328735,
	"rewards/reward_parseable": 0.9375,
	"step": 166
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 997.0,
	"completions/mean_length": 235.6875,
	"completions/mean_terminated_length": 223.17460317460316,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 83.5,
	"grad_norm": 0.00011323333455948159,
	"kl": 0.02139199187513441,
	"learning_rate": 2.856826595508678e-05,
	"loss": 0.0003,
	"num_tokens": 5636156.0,
	"reward": 0.6084253191947937,
	"reward_std": 0.04125886410474777,
	"rewards/reward_matching": 0.48200953006744385,
	"rewards/reward_object_count": 0.5960979461669922,
	"rewards/reward_parseable": 1.0,
	"step": 167
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.046875,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 957.0,
	"completions/mean_length": 277.84375,
	"completions/mean_terminated_length": 241.14754098360655,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 84.0,
	"grad_norm": 0.00011925779836019501,
	"kl": 0.013627393753267825,
	"learning_rate": 2.8547481435507382e-05,
	"loss": 0.0012,
	"num_tokens": 5674162.0,
	"reward": 0.6911174654960632,
	"reward_std": 0.04913552850484848,
	"rewards/reward_matching": 0.5930126905441284,
	"rewards/reward_object_count": 0.6765491366386414,
	"rewards/reward_parseable": 1.0,
	"step": 168
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 539.0,
	"completions/max_terminated_length": 539.0,
	"completions/mean_length": 159.125,
	"completions/mean_terminated_length": 159.125,
	"completions/min_length": 12.0,
	"completions/min_terminated_length": 12.0,
	"epoch": 84.5,
	"grad_norm": 0.00013656335067935288,
	"kl": 0.03119825676549226,
	"learning_rate": 2.852655480925828e-05,
	"loss": 0.0001,
	"num_tokens": 5702330.0,
	"reward": 0.5503749251365662,
	"reward_std": 0.07836627960205078,
	"rewards/reward_matching": 0.3975270688533783,
	"rewards/reward_object_count": 0.5749184489250183,
	"rewards/reward_parseable": 0.984375,
	"step": 169
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 865.0,
	"completions/mean_length": 285.15625,
	"completions/mean_terminated_length": 235.9,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 85.0,
	"grad_norm": 0.0001269476197194308,
	"kl": 0.015716996625997126,
	"learning_rate": 2.8505486295849884e-05,
	"loss": 0.0009,
	"num_tokens": 5737924.0,
	"reward": 0.5956513285636902,
	"reward_std": 0.05721241980791092,
	"rewards/reward_matching": 0.495978981256485,
	"rewards/reward_object_count": 0.49031955003738403,
	"rewards/reward_parseable": 1.0,
	"step": 170
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.046875,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 235.0,
	"completions/mean_length": 167.46875,
	"completions/mean_terminated_length": 125.34426229508196,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 85.5,
	"grad_norm": 0.00018915600958280265,
	"kl": 0.032580646337009966,
	"learning_rate": 2.848427611628093e-05,
	"loss": 0.0005,
	"num_tokens": 5766946.0,
	"reward": 0.5924456119537354,
	"reward_std": 0.06654588133096695,
	"rewards/reward_matching": 0.5018090009689331,
	"rewards/reward_object_count": 0.4724262058734894,
	"rewards/reward_parseable": 0.984375,
	"step": 171
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.046875,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 997.0,
	"completions/mean_length": 231.484375,
	"completions/mean_terminated_length": 192.50819672131146,
	"completions/min_length": 11.0,
	"completions/min_terminated_length": 11.0,
	"epoch": 86.0,
	"grad_norm": 0.00022493403230328113,
	"kl": 0.02257319202180952,
	"learning_rate": 2.8462924493036168e-05,
	"loss": 0.0003,
	"num_tokens": 5806145.0,
	"reward": 0.7301434278488159,
	"reward_std": 0.07324472069740295,
	"rewards/reward_matching": 0.6351655721664429,
	"rewards/reward_object_count": 0.7608456611633301,
	"rewards/reward_parseable": 0.984375,
	"step": 172
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 971.0,
	"completions/max_terminated_length": 971.0,
	"completions/mean_length": 316.921875,
	"completions/mean_terminated_length": 316.921875,
	"completions/min_length": 73.0,
	"completions/min_terminated_length": 73.0,
	"epoch": 86.5,
	"grad_norm": 0.00020160213171038777,
	"kl": 0.018056653905659914,
	"learning_rate": 2.8441431650084018e-05,
	"loss": 0.001,
	"num_tokens": 5843452.0,
	"reward": 0.565083920955658,
	"reward_std": 0.04591123014688492,
	"rewards/reward_matching": 0.5012180805206299,
	"rewards/reward_object_count": 0.3217654228210449,
	"rewards/reward_parseable": 1.0,
	"step": 173
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 668.0,
	"completions/max_terminated_length": 668.0,
	"completions/mean_length": 132.734375,
	"completions/mean_terminated_length": 132.734375,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 87.0,
	"grad_norm": 9.513698751106858e-05,
	"kl": 0.019756762427277863,
	"learning_rate": 2.841979781287424e-05,
	"loss": 0.0001,
	"num_tokens": 5874091.0,
	"reward": 0.721807599067688,
	"reward_std": 0.030880732461810112,
	"rewards/reward_matching": 0.632381796836853,
	"rewards/reward_object_count": 0.7118923664093018,
	"rewards/reward_parseable": 1.0,
	"step": 174
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 709.0,
	"completions/max_terminated_length": 709.0,
	"completions/mean_length": 249.890625,
	"completions/mean_terminated_length": 249.890625,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 87.5,
	"grad_norm": 0.0001513262395747006,
	"kl": 0.022078259498812258,
	"learning_rate": 2.8398023208335537e-05,
	"loss": -0.0007,
	"num_tokens": 5907748.0,
	"reward": 0.5269720554351807,
	"reward_std": 0.05287637189030647,
	"rewards/reward_matching": 0.398820161819458,
	"rewards/reward_object_count": 0.45402464270591736,
	"rewards/reward_parseable": 0.984375,
	"step": 175
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.046875,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 895.0,
	"completions/mean_length": 298.640625,
	"completions/mean_terminated_length": 262.9672131147541,
	"completions/min_length": 12.0,
	"completions/min_terminated_length": 12.0,
	"epoch": 88.0,
	"grad_norm": 0.00025134760653600097,
	"kl": 0.04158466309309006,
	"learning_rate": 2.8376108064873216e-05,
	"loss": -0.0008,
	"num_tokens": 5945805.0,
	"reward": 0.543440043926239,
	"reward_std": 0.06771589815616608,
	"rewards/reward_matching": 0.4773101806640625,
	"rewards/reward_object_count": 0.316519558429718,
	"rewards/reward_parseable": 0.96875,
	"step": 176
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 700.0,
	"completions/mean_length": 252.25,
	"completions/mean_terminated_length": 240.0,
	"completions/min_length": 9.0,
	"completions/min_terminated_length": 9.0,
	"epoch": 88.5,
	"grad_norm": 0.00017161465075332671,
	"kl": 0.03505228122230619,
	"learning_rate": 2.835405261236676e-05,
	"loss": -0.0004,
	"num_tokens": 5978333.0,
	"reward": 0.6240804195404053,
	"reward_std": 0.06789788603782654,
	"rewards/reward_matching": 0.5318564772605896,
	"rewards/reward_object_count": 0.5404576063156128,
	"rewards/reward_parseable": 0.984375,
	"step": 177
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 601.0,
	"completions/max_terminated_length": 601.0,
	"completions/mean_length": 239.578125,
	"completions/mean_terminated_length": 239.578125,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 89.0,
	"grad_norm": 0.00016913673607632518,
	"kl": 0.02632876578718424,
	"learning_rate": 2.833185708216743e-05,
	"loss": 0.0,
	"num_tokens": 6011650.0,
	"reward": 0.5502422451972961,
	"reward_std": 0.0771588683128357,
	"rewards/reward_matching": 0.41941624879837036,
	"rewards/reward_object_count": 0.5085875988006592,
	"rewards/reward_parseable": 0.984375,
	"step": 178
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 539.0,
	"completions/mean_length": 199.859375,
	"completions/mean_terminated_length": 186.77777777777777,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 89.5,
	"grad_norm": 0.00014499339158646762,
	"kl": 0.024362510768696666,
	"learning_rate": 2.8309521707095835e-05,
	"loss": 0.0003,
	"num_tokens": 6046585.0,
	"reward": 0.6444197297096252,
	"reward_std": 0.056686967611312866,
	"rewards/reward_matching": 0.5137526392936707,
	"rewards/reward_object_count": 0.680840790271759,
	"rewards/reward_parseable": 1.0,
	"step": 179
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 712.0,
	"completions/mean_length": 214.703125,
	"completions/mean_terminated_length": 188.59677419354838,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 90.0,
	"grad_norm": 0.00012196839088574052,
	"kl": 0.024665123783051968,
	"learning_rate": 2.8287046721439487e-05,
	"loss": 0.0004,
	"num_tokens": 6077030.0,
	"reward": 0.5945574045181274,
	"reward_std": 0.06549065560102463,
	"rewards/reward_matching": 0.5034471750259399,
	"rewards/reward_object_count": 0.4624456763267517,
	"rewards/reward_parseable": 1.0,
	"step": 180
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 240.0,
	"completions/max_terminated_length": 240.0,
	"completions/mean_length": 134.484375,
	"completions/mean_terminated_length": 134.484375,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 90.5,
	"grad_norm": 0.00010401565668871626,
	"kl": 0.03254161588847637,
	"learning_rate": 2.8264432360950355e-05,
	"loss": -0.0001,
	"num_tokens": 6106501.0,
	"reward": 0.6277846693992615,
	"reward_std": 0.04529394954442978,
	"rewards/reward_matching": 0.542320966720581,
	"rewards/reward_object_count": 0.5119605660438538,
	"rewards/reward_parseable": 1.0,
	"step": 181
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 700.0,
	"completions/mean_length": 295.390625,
	"completions/mean_terminated_length": 283.8253968253968,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 91.0,
	"grad_norm": 0.00018713607278186828,
	"kl": 0.030495932791382074,
	"learning_rate": 2.8241678862842374e-05,
	"loss": 0.0004,
	"num_tokens": 6143390.0,
	"reward": 0.5975713729858398,
	"reward_std": 0.04703337699174881,
	"rewards/reward_matching": 0.48192787170410156,
	"rewards/reward_object_count": 0.5420730710029602,
	"rewards/reward_parseable": 1.0,
	"step": 182
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 501.0,
	"completions/max_terminated_length": 501.0,
	"completions/mean_length": 193.671875,
	"completions/mean_terminated_length": 193.671875,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 91.5,
	"grad_norm": 0.00020340237824711949,
	"kl": 0.034934017108753324,
	"learning_rate": 2.8218786465788984e-05,
	"loss": -0.0003,
	"num_tokens": 6175689.0,
	"reward": 0.6315833926200867,
	"reward_std": 0.10472890734672546,
	"rewards/reward_matching": 0.49378255009651184,
	"rewards/reward_object_count": 0.6921942830085754,
	"rewards/reward_parseable": 0.984375,
	"step": 183
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.046875,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 1020.0,
	"completions/mean_length": 293.3125,
	"completions/mean_terminated_length": 257.37704918032784,
	"completions/min_length": 19.0,
	"completions/min_terminated_length": 19.0,
	"epoch": 92.0,
	"grad_norm": 0.00030767079442739487,
	"kl": 0.02895939163863659,
	"learning_rate": 2.8195755409920584e-05,
	"loss": 0.0009,
	"num_tokens": 6212765.0,
	"reward": 0.5491479635238647,
	"reward_std": 0.09953216463327408,
	"rewards/reward_matching": 0.4275497496128082,
	"rewards/reward_object_count": 0.49434059858322144,
	"rewards/reward_parseable": 0.96875,
	"step": 184
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 387.0,
	"completions/mean_length": 195.203125,
	"completions/mean_terminated_length": 168.46774193548387,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 92.5,
	"grad_norm": 0.00014441045641433448,
	"kl": 0.028225229121744633,
	"learning_rate": 2.8172585936822056e-05,
	"loss": 0.0005,
	"num_tokens": 6245162.0,
	"reward": 0.6523048877716064,
	"reward_std": 0.06327737867832184,
	"rewards/reward_matching": 0.5074872970581055,
	"rewards/reward_object_count": 0.739062488079071,
	"rewards/reward_parseable": 1.0,
	"step": 185
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 653.0,
	"completions/mean_length": 167.1875,
	"completions/mean_terminated_length": 139.5483870967742,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 93.0,
	"grad_norm": 0.0001068919082172215,
	"kl": 0.039769482566043735,
	"learning_rate": 2.814927828953022e-05,
	"loss": -0.0004,
	"num_tokens": 6273206.0,
	"reward": 0.5778319835662842,
	"reward_std": 0.0708276629447937,
	"rewards/reward_matching": 0.4646483063697815,
	"rewards/reward_object_count": 0.5108397603034973,
	"rewards/reward_parseable": 0.984375,
	"step": 186
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 845.0,
	"completions/mean_length": 151.65625,
	"completions/mean_terminated_length": 137.8095238095238,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 93.5,
	"grad_norm": 0.00015888724010437727,
	"kl": 0.05473130161408335,
	"learning_rate": 2.812583271253125e-05,
	"loss": -0.0007,
	"num_tokens": 6304096.0,
	"reward": 0.6830726861953735,
	"reward_std": 0.061051130294799805,
	"rewards/reward_matching": 0.5923636555671692,
	"rewards/reward_object_count": 0.653897225856781,
	"rewards/reward_parseable": 0.984375,
	"step": 187
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 995.0,
	"completions/mean_length": 263.234375,
	"completions/mean_terminated_length": 238.69354838709677,
	"completions/min_length": 31.0,
	"completions/min_terminated_length": 31.0,
	"epoch": 94.0,
	"grad_norm": 0.00038123532431200147,
	"kl": 0.026985038304701447,
	"learning_rate": 2.8102249451758162e-05,
	"loss": 0.0007,
	"num_tokens": 6340207.0,
	"reward": 0.6431405544281006,
	"reward_std": 0.07958689332008362,
	"rewards/reward_matching": 0.5382211804389954,
	"rewards/reward_object_count": 0.6166639924049377,
	"rewards/reward_parseable": 0.984375,
	"step": 188
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 387.0,
	"completions/max_terminated_length": 387.0,
	"completions/mean_length": 102.609375,
	"completions/mean_terminated_length": 102.609375,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 94.5,
	"grad_norm": 9.083108307095245e-05,
	"kl": 0.03483374323695898,
	"learning_rate": 2.8078528754588207e-05,
	"loss": -0.0001,
	"num_tokens": 6366358.0,
	"reward": 0.6219313144683838,
	"reward_std": 0.039522431790828705,
	"rewards/reward_matching": 0.5477997064590454,
	"rewards/reward_object_count": 0.4662574529647827,
	"rewards/reward_parseable": 1.0,
	"step": 189
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 238.0,
	"completions/max_terminated_length": 238.0,
	"completions/mean_length": 93.3125,
	"completions/mean_terminated_length": 93.3125,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 95.0,
	"grad_norm": 9.838482219493017e-05,
	"kl": 0.038898272439837456,
	"learning_rate": 2.805467086984027e-05,
	"loss": -0.0001,
	"num_tokens": 6391914.0,
	"reward": 0.7201097011566162,
	"reward_std": 0.07063695788383484,
	"rewards/reward_matching": 0.5871620178222656,
	"rewards/reward_object_count": 0.839062511920929,
	"rewards/reward_parseable": 1.0,
	"step": 190
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 501.0,
	"completions/max_terminated_length": 501.0,
	"completions/mean_length": 159.984375,
	"completions/mean_terminated_length": 159.984375,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 95.5,
	"grad_norm": 0.0001392089470755309,
	"kl": 0.030272011645138264,
	"learning_rate": 2.803067604777227e-05,
	"loss": 0.0001,
	"num_tokens": 6421417.0,
	"reward": 0.650718092918396,
	"reward_std": 0.06433884799480438,
	"rewards/reward_matching": 0.5119979381561279,
	"rewards/reward_object_count": 0.7175967693328857,
	"rewards/reward_parseable": 1.0,
	"step": 191
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 667.0,
	"completions/mean_length": 149.171875,
	"completions/mean_terminated_length": 135.28571428571428,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 96.0,
	"grad_norm": 0.00027154432609677315,
	"kl": 0.09678576281294227,
	"learning_rate": 2.8006544540078535e-05,
	"loss": -0.0018,
	"num_tokens": 6453108.0,
	"reward": 0.6846904158592224,
	"reward_std": 0.0770767480134964,
	"rewards/reward_matching": 0.6123154759407043,
	"rewards/reward_object_count": 0.633380651473999,
	"rewards/reward_parseable": 0.953125,
	"step": 192
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 767.0,
	"completions/mean_length": 327.015625,
	"completions/mean_terminated_length": 280.55,
	"completions/min_length": 11.0,
	"completions/min_terminated_length": 11.0,
	"epoch": 96.5,
	"grad_norm": 0.0002469752507749945,
	"kl": 0.027959817787632346,
	"learning_rate": 2.798227659988717e-05,
	"loss": 0.0006,
	"num_tokens": 6492981.0,
	"reward": 0.6113128662109375,
	"reward_std": 0.0816509798169136,
	"rewards/reward_matching": 0.5030540823936462,
	"rewards/reward_object_count": 0.5630275011062622,
	"rewards/reward_parseable": 0.984375,
	"step": 193
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 805.0,
	"completions/mean_length": 267.453125,
	"completions/mean_terminated_length": 255.44444444444446,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 97.0,
	"grad_norm": 0.0006099395686760545,
	"kl": 0.06375238881446421,
	"learning_rate": 2.7957872481757377e-05,
	"loss": -0.0017,
	"num_tokens": 6530002.0,
	"reward": 0.5870110988616943,
	"reward_std": 0.06495887041091919,
	"rewards/reward_matching": 0.5122255086898804,
	"rewards/reward_object_count": 0.44525402784347534,
	"rewards/reward_parseable": 0.953125,
	"step": 194
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 931.0,
	"completions/mean_length": 241.21875,
	"completions/mean_terminated_length": 228.79365079365078,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 97.5,
	"grad_norm": 0.00015396032540593296,
	"kl": 0.022518991609103978,
	"learning_rate": 2.793333244167681e-05,
	"loss": 0.0002,
	"num_tokens": 6561504.0,
	"reward": 0.6182751655578613,
	"reward_std": 0.05381970480084419,
	"rewards/reward_matching": 0.5018588304519653,
	"rewards/reward_object_count": 0.5857993960380554,
	"rewards/reward_parseable": 1.0,
	"step": 195
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 238.0,
	"completions/max_terminated_length": 238.0,
	"completions/mean_length": 97.671875,
	"completions/mean_terminated_length": 97.671875,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 98.0,
	"grad_norm": 8.675593562657014e-05,
	"kl": 0.03554528998211026,
	"learning_rate": 2.790865673705888e-05,
	"loss": 0.0,
	"num_tokens": 6590219.0,
	"reward": 0.7543612122535706,
	"reward_std": 0.05268620699644089,
	"rewards/reward_matching": 0.6470256447792053,
	"rewards/reward_object_count": 0.8307291865348816,
	"rewards/reward_parseable": 1.0,
	"step": 196
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 435.0,
	"completions/max_terminated_length": 435.0,
	"completions/mean_length": 158.203125,
	"completions/mean_terminated_length": 158.203125,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 98.5,
	"grad_norm": 0.00011117629037471488,
	"kl": 0.02265286911278963,
	"learning_rate": 2.7883845626740046e-05,
	"loss": -0.0,
	"num_tokens": 6619288.0,
	"reward": 0.6950995326042175,
	"reward_std": 0.04912342131137848,
	"rewards/reward_matching": 0.5725617408752441,
	"rewards/reward_object_count": 0.7578125,
	"rewards/reward_parseable": 1.0,
	"step": 197
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 919.0,
	"completions/mean_length": 261.4375,
	"completions/mean_terminated_length": 249.33333333333334,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 99.0,
	"grad_norm": 0.00015910938964225352,
	"kl": 0.01948182564228773,
	"learning_rate": 2.7858899370977123e-05,
	"loss": 0.0007,
	"num_tokens": 6654324.0,
	"reward": 0.5254322290420532,
	"reward_std": 0.04911264032125473,
	"rewards/reward_matching": 0.4140118956565857,
	"rewards/reward_object_count": 0.3851252794265747,
	"rewards/reward_parseable": 1.0,
	"step": 198
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 1024.0,
	"completions/max_terminated_length": 709.0,
	"completions/mean_length": 307.6875,
	"completions/mean_terminated_length": 284.5806451612903,
	"completions/min_length": 5.0,
	"completions/min_terminated_length": 5.0,
	"epoch": 99.5,
	"grad_norm": 0.00020870369917247444,
	"kl": 0.02964519546367228,
	"learning_rate": 2.783381823144452e-05,
	"loss": -0.0003,
	"num_tokens": 6693920.0,
	"reward": 0.5857189893722534,
	"reward_std": 0.07470154017210007,
	"rewards/reward_matching": 0.5070096254348755,
	"rewards/reward_object_count": 0.4231909513473511,
	"rewards/reward_parseable": 0.984375,
	"step": 199
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 139.0,
	"completions/max_terminated_length": 139.0,
	"completions/mean_length": 58.0625,
	"completions/mean_terminated_length": 58.0625,
	"completions/min_length": 40.0,
	"completions/min_terminated_length": 40.0,
	"epoch": 100.0,
	"grad_norm": 0.00010931400902336463,
	"kl": 0.04551131138578057,
	"learning_rate": 2.780860247123153e-05,
	"loss": -0.0,
	"num_tokens": 6716580.0,
	"reward": 0.6769169569015503,
	"reward_std": 0.06710667908191681,
	"rewards/reward_matching": 0.5183860063552856,
	"rewards/reward_object_count": 0.8294271230697632,
	"rewards/reward_parseable": 1.0,
	"step": 200
	}
	],
	"logging_steps": 1,
	"max_steps": 1000,
	"num_input_tokens_seen": 6716580,
	"num_train_epochs": 500,
	"save_steps": 25,
	"stateful_callbacks": {
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": true,
	"should_log": false,
	"should_save": true,
	"should_training_stop": false
	},
	"attributes": {}
	}
	},
	"total_flos": 0.0,
	"train_batch_size": 8,
	"trial_name": null,
	"trial_params": null
	}