Instructions to use adraganov/trl_model_step_250 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use adraganov/trl_model_step_250 with PEFT:

from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("google/gemma-3-12b-it")
model = PeftModel.from_pretrained(base_model, "adraganov/trl_model_step_250")

Transformers

How to use adraganov/trl_model_step_250 with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="adraganov/trl_model_step_250")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("adraganov/trl_model_step_250", dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use adraganov/trl_model_step_250 with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "adraganov/trl_model_step_250"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "adraganov/trl_model_step_250",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

docker model run hf.co/adraganov/trl_model_step_250

SGLang

How to use adraganov/trl_model_step_250 with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "adraganov/trl_model_step_250" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "adraganov/trl_model_step_250",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "adraganov/trl_model_step_250" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "adraganov/trl_model_step_250",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use adraganov/trl_model_step_250 with Docker Model Runner:
```
docker model run hf.co/adraganov/trl_model_step_250
```

trl_model_step_250 / trainer_state.json

adraganov

Upload folder using huggingface_hub

c67b758 verified 9 months ago

raw

history blame contribute delete

245 kB

	{
	"best_global_step": null,
	"best_metric": null,
	"best_model_checkpoint": null,
	"epoch": 3.571428571428571,
	"eval_steps": 500,
	"global_step": 250,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.060131815262138844,
	"epoch": 0.014285714285714285,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.05771088972687721,
	"kl": 0.0,
	"learning_rate": 5e-05,
	"loss": 0.0,
	"num_tokens": 17832.0,
	"reward": 1.0437500476837158,
	"reward_std": 0.0353553369641304,
	"rewards/oai_reward_function/mean": 0.5218750014901161,
	"rewards/oai_reward_function/std": 0.043879419565200806,
	"step": 1
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.06413675658404827,
	"epoch": 0.02857142857142857,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.03477979078888893,
	"kl": 0.0003001746808877215,
	"learning_rate": 4.928571428571429e-05,
	"loss": 0.0,
	"num_tokens": 35712.0,
	"reward": 1.046875,
	"reward_std": 0.028149789199233055,
	"rewards/oai_reward_function/mean": 0.5234375,
	"rewards/oai_reward_function/std": 0.049161311239004135,
	"step": 2
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.052969515323638916,
	"epoch": 0.04285714285714286,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0005888506420888007,
	"kl": 0.0004545010087895207,
	"learning_rate": 4.8571428571428576e-05,
	"loss": 0.0,
	"num_tokens": 53424.0,
	"reward": 1.0,
	"reward_std": 0.0,
	"rewards/oai_reward_function/mean": 0.5,
	"rewards/oai_reward_function/std": 0.0,
	"step": 3
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.06199027318507433,
	"epoch": 0.05714285714285714,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.04447643458843231,
	"kl": 0.0005710393161280081,
	"learning_rate": 4.785714285714286e-05,
	"loss": 0.0,
	"num_tokens": 71248.0,
	"reward": 1.2265625,
	"reward_std": 0.004419416189193726,
	"rewards/oai_reward_function/mean": 0.61328125,
	"rewards/oai_reward_function/std": 0.1993926614522934,
	"step": 4
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.0633242828771472,
	"epoch": 0.07142857142857142,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.043302807956933975,
	"kl": 0.001818844728404656,
	"learning_rate": 4.714285714285714e-05,
	"loss": 0.0,
	"num_tokens": 89000.0,
	"reward": 1.032812476158142,
	"reward_std": 0.022097092121839523,
	"rewards/oai_reward_function/mean": 0.5164062492549419,
	"rewards/oai_reward_function/std": 0.03570114076137543,
	"step": 5
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.06267449539154768,
	"epoch": 0.08571428571428572,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.048733897507190704,
	"kl": 0.0011250173120060936,
	"learning_rate": 4.642857142857143e-05,
	"loss": 0.0,
	"num_tokens": 106816.0,
	"reward": 1.071874976158142,
	"reward_std": 0.03390505909919739,
	"rewards/oai_reward_function/mean": 0.5359375029802322,
	"rewards/oai_reward_function/std": 0.07097747921943665,
	"step": 6
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.06247459910809994,
	"epoch": 0.1,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.07503627240657806,
	"kl": 0.0016785123152658343,
	"learning_rate": 4.5714285714285716e-05,
	"loss": 0.0,
	"num_tokens": 124592.0,
	"reward": 1.181249976158142,
	"reward_std": 0.06808801740407944,
	"rewards/oai_reward_function/mean": 0.5906250029802322,
	"rewards/oai_reward_function/std": 0.13951963186264038,
	"step": 7
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09173925407230854,
	"epoch": 0.11428571428571428,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.046880681067705154,
	"kl": 0.004017388273496181,
	"learning_rate": 4.5e-05,
	"loss": 0.0,
	"num_tokens": 142368.0,
	"reward": 1.001562476158142,
	"reward_std": 0.004419416189193726,
	"rewards/oai_reward_function/mean": 0.5007812500116415,
	"rewards/oai_reward_function/std": 0.0044194175861775875,
	"step": 8
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.07956545054912567,
	"epoch": 0.12857142857142856,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.061871547251939774,
	"kl": 0.00639598595444113,
	"learning_rate": 4.428571428571428e-05,
	"loss": 0.0001,
	"num_tokens": 160160.0,
	"reward": 1.021875023841858,
	"reward_std": 0.052504248917102814,
	"rewards/oai_reward_function/mean": 0.5109375007450581,
	"rewards/oai_reward_function/std": 0.053482551127672195,
	"step": 9
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.06956008821725845,
	"epoch": 0.14285714285714285,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.06243785470724106,
	"kl": 0.00972771504893899,
	"learning_rate": 4.3571428571428576e-05,
	"loss": 0.0001,
	"num_tokens": 177984.0,
	"reward": 1.2296874523162842,
	"reward_std": 0.01684970036149025,
	"rewards/oai_reward_function/mean": 0.6148437485098839,
	"rewards/oai_reward_function/std": 0.1987723708152771,
	"step": 10
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.07967641018331051,
	"epoch": 0.15714285714285714,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.07661325484514236,
	"kl": 0.0069638064596802,
	"learning_rate": 4.2857142857142856e-05,
	"loss": 0.0001,
	"num_tokens": 195896.0,
	"reward": 1.1062500476837158,
	"reward_std": 0.06087504327297211,
	"rewards/oai_reward_function/mean": 0.5531250014901161,
	"rewards/oai_reward_function/std": 0.08584260195493698,
	"step": 11
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.0841637123376131,
	"epoch": 0.17142857142857143,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.05056445300579071,
	"kl": 0.011949660489335656,
	"learning_rate": 4.214285714285714e-05,
	"loss": 0.0001,
	"num_tokens": 213760.0,
	"reward": 1.131250023841858,
	"reward_std": 0.029124131426215172,
	"rewards/oai_reward_function/mean": 0.5656249970197678,
	"rewards/oai_reward_function/std": 0.11875531077384949,
	"step": 12
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.0837175901979208,
	"epoch": 0.18571428571428572,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.1177188903093338,
	"kl": 0.01176721346564591,
	"learning_rate": 4.1428571428571437e-05,
	"loss": 0.0001,
	"num_tokens": 231664.0,
	"reward": 1.2421875,
	"reward_std": 0.02758825570344925,
	"rewards/oai_reward_function/mean": 0.62109375,
	"rewards/oai_reward_function/std": 0.1868790090084076,
	"step": 13
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.07715502567589283,
	"epoch": 0.2,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0020164160523563623,
	"kl": 0.013234916375949979,
	"learning_rate": 4.0714285714285717e-05,
	"loss": 0.0001,
	"num_tokens": 249528.0,
	"reward": 1.0,
	"reward_std": 0.0,
	"rewards/oai_reward_function/mean": 0.5,
	"rewards/oai_reward_function/std": 0.0,
	"step": 14
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.0780396144837141,
	"epoch": 0.21428571428571427,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0018555221613496542,
	"kl": 0.011373426881618798,
	"learning_rate": 4e-05,
	"loss": 0.0001,
	"num_tokens": 267168.0,
	"reward": 1.0,
	"reward_std": 0.0,
	"rewards/oai_reward_function/mean": 0.5,
	"rewards/oai_reward_function/std": 0.0,
	"step": 15
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.0733959898352623,
	"epoch": 0.22857142857142856,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.09124160557985306,
	"kl": 0.021819928660988808,
	"learning_rate": 3.928571428571429e-05,
	"loss": 0.0002,
	"num_tokens": 284928.0,
	"reward": 1.0484375953674316,
	"reward_std": 0.05051835626363754,
	"rewards/oai_reward_function/mean": 0.5242187511175871,
	"rewards/oai_reward_function/std": 0.044669199734926224,
	"step": 16
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09740176424384117,
	"epoch": 0.24285714285714285,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.052958983927965164,
	"kl": 0.028434510342776775,
	"learning_rate": 3.857142857142858e-05,
	"loss": 0.0003,
	"num_tokens": 302816.0,
	"reward": 1.071874976158142,
	"reward_std": 0.06469365209341049,
	"rewards/oai_reward_function/mean": 0.5359374992549419,
	"rewards/oai_reward_function/std": 0.0882028192281723,
	"step": 17
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08574963361024857,
	"epoch": 0.2571428571428571,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.04292497783899307,
	"kl": 0.033173230942338705,
	"learning_rate": 3.785714285714286e-05,
	"loss": 0.0003,
	"num_tokens": 320584.0,
	"reward": 1.001562476158142,
	"reward_std": 0.004419416189193726,
	"rewards/oai_reward_function/mean": 0.5007812500116415,
	"rewards/oai_reward_function/std": 0.0044194175861775875,
	"step": 18
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.12318380549550056,
	"epoch": 0.2714285714285714,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.06502827256917953,
	"kl": 0.03774468321353197,
	"learning_rate": 3.7142857142857143e-05,
	"loss": 0.0004,
	"num_tokens": 338448.0,
	"reward": 1.109375,
	"reward_std": 0.05164698138833046,
	"rewards/oai_reward_function/mean": 0.5546875,
	"rewards/oai_reward_function/std": 0.10803177952766418,
	"step": 19
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08734610676765442,
	"epoch": 0.2857142857142857,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.07792048156261444,
	"kl": 0.028081147465854883,
	"learning_rate": 3.642857142857143e-05,
	"loss": 0.0003,
	"num_tokens": 356200.0,
	"reward": 1.03125,
	"reward_std": 0.047612957656383514,
	"rewards/oai_reward_function/mean": 0.515625,
	"rewards/oai_reward_function/std": 0.04151855409145355,
	"step": 20
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08050715737044811,
	"epoch": 0.3,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.047350119799375534,
	"kl": 0.026192680466920137,
	"learning_rate": 3.571428571428572e-05,
	"loss": 0.0003,
	"num_tokens": 373912.0,
	"reward": 0.503125011920929,
	"reward_std": 0.008838832378387451,
	"rewards/oai_reward_function/mean": 0.25156250002328306,
	"rewards/oai_reward_function/std": 0.26283908169716597,
	"step": 21
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.07859978079795837,
	"epoch": 0.3142857142857143,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.18296176195144653,
	"kl": 0.035550063010305166,
	"learning_rate": 3.5e-05,
	"loss": 0.0004,
	"num_tokens": 391880.0,
	"reward": 0.2578125,
	"reward_std": 0.4363012909889221,
	"rewards/oai_reward_function/mean": 0.12890625,
	"rewards/oai_reward_function/std": 0.28480061888694763,
	"step": 22
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.07332467474043369,
	"epoch": 0.32857142857142857,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.26302284002304077,
	"kl": 0.02304189372807741,
	"learning_rate": 3.428571428571429e-05,
	"loss": 0.0002,
	"num_tokens": 409592.0,
	"reward": 0.4375,
	"reward_std": 0.3335031569004059,
	"rewards/oai_reward_function/mean": 0.21875,
	"rewards/oai_reward_function/std": 0.2520080506801605,
	"step": 23
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08370361104607582,
	"epoch": 0.34285714285714286,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.07096434384584427,
	"kl": 0.02303632628172636,
	"learning_rate": 3.357142857142857e-05,
	"loss": 0.0002,
	"num_tokens": 427504.0,
	"reward": 1.0906250476837158,
	"reward_std": 0.12288369983434677,
	"rewards/oai_reward_function/mean": 0.5453124977648258,
	"rewards/oai_reward_function/std": 0.18977738916873932,
	"step": 24
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.10516241379082203,
	"epoch": 0.35714285714285715,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.10253780335187912,
	"kl": 0.022893703542649746,
	"learning_rate": 3.285714285714286e-05,
	"loss": 0.0002,
	"num_tokens": 445464.0,
	"reward": 1.0281250476837158,
	"reward_std": 0.11285631358623505,
	"rewards/oai_reward_function/mean": 0.5140625014901161,
	"rewards/oai_reward_function/std": 0.13734418153762817,
	"step": 25
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.10455058515071869,
	"epoch": 0.37142857142857144,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0020646003540605307,
	"kl": 0.013847913593053818,
	"learning_rate": 3.2142857142857144e-05,
	"loss": 0.0001,
	"num_tokens": 463176.0,
	"reward": 1.0,
	"reward_std": 0.0,
	"rewards/oai_reward_function/mean": 0.5,
	"rewards/oai_reward_function/std": 0.0,
	"step": 26
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.10624882206320763,
	"epoch": 0.38571428571428573,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.08771698921918869,
	"kl": 0.023737956769764423,
	"learning_rate": 3.142857142857143e-05,
	"loss": 0.0002,
	"num_tokens": 480896.0,
	"reward": 1.1234374046325684,
	"reward_std": 0.12076057493686676,
	"rewards/oai_reward_function/mean": 0.5617187507450581,
	"rewards/oai_reward_function/std": 0.09692539274692535,
	"step": 27
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09867865778505802,
	"epoch": 0.4,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.04759760946035385,
	"kl": 0.016957666259258986,
	"learning_rate": 3.071428571428572e-05,
	"loss": 0.0002,
	"num_tokens": 498752.0,
	"reward": 1.0515625476837158,
	"reward_std": 0.016952523961663246,
	"rewards/oai_reward_function/mean": 0.525781249627471,
	"rewards/oai_reward_function/std": 0.048144761472940445,
	"step": 28
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.12629481963813305,
	"epoch": 0.4142857142857143,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.058567993342876434,
	"kl": 0.017663696315139532,
	"learning_rate": 3e-05,
	"loss": 0.0002,
	"num_tokens": 516552.0,
	"reward": 1.2234375476837158,
	"reward_std": 0.018139135092496872,
	"rewards/oai_reward_function/mean": 0.6117187514901161,
	"rewards/oai_reward_function/std": 0.1933349370956421,
	"step": 29
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.1236942820250988,
	"epoch": 0.42857142857142855,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.08224395662546158,
	"kl": 0.011707060737535357,
	"learning_rate": 2.9285714285714288e-05,
	"loss": 0.0001,
	"num_tokens": 534336.0,
	"reward": 1.1906249523162842,
	"reward_std": 0.09417471289634705,
	"rewards/oai_reward_function/mean": 0.5953124985098839,
	"rewards/oai_reward_function/std": 0.2836897447705269,
	"step": 30
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.12007096596062183,
	"epoch": 0.44285714285714284,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.12164021283388138,
	"kl": 0.015199759975075722,
	"learning_rate": 2.857142857142857e-05,
	"loss": 0.0002,
	"num_tokens": 552288.0,
	"reward": 1.459375023841858,
	"reward_std": 0.23513765633106232,
	"rewards/oai_reward_function/mean": 0.729687511920929,
	"rewards/oai_reward_function/std": 0.31374088674783707,
	"step": 31
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.12509393319487572,
	"epoch": 0.45714285714285713,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.09083209186792374,
	"kl": 0.01757637900300324,
	"learning_rate": 2.785714285714286e-05,
	"loss": 0.0002,
	"num_tokens": 570160.0,
	"reward": 1.076562523841858,
	"reward_std": 0.04446931555867195,
	"rewards/oai_reward_function/mean": 0.5382812470197678,
	"rewards/oai_reward_function/std": 0.07267481088638306,
	"step": 32
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.12463634088635445,
	"epoch": 0.4714285714285714,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.002431818749755621,
	"kl": 0.014958202606067061,
	"learning_rate": 2.714285714285714e-05,
	"loss": 0.0001,
	"num_tokens": 587872.0,
	"reward": 1.0,
	"reward_std": 0.0,
	"rewards/oai_reward_function/mean": 0.5,
	"rewards/oai_reward_function/std": 0.0,
	"step": 33
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.11686164513230324,
	"epoch": 0.4857142857142857,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.05484098196029663,
	"kl": 0.010952673037536442,
	"learning_rate": 2.642857142857143e-05,
	"loss": 0.0001,
	"num_tokens": 605824.0,
	"reward": 1.0968749523162842,
	"reward_std": 0.09722718596458435,
	"rewards/oai_reward_function/mean": 0.5484375022351742,
	"rewards/oai_reward_function/std": 0.0920066386461258,
	"step": 34
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.12041523866355419,
	"epoch": 0.5,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.05225397273898125,
	"kl": 0.006640716805122793,
	"learning_rate": 2.5714285714285714e-05,
	"loss": 0.0001,
	"num_tokens": 623624.0,
	"reward": 1.0046875476837158,
	"reward_std": 0.0093002924695611,
	"rewards/oai_reward_function/mean": 0.5023437500931323,
	"rewards/oai_reward_function/std": 0.009753772988915443,
	"step": 35
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.12032002210617065,
	"epoch": 0.5142857142857142,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.07161174714565277,
	"kl": 0.010428835870698094,
	"learning_rate": 2.5e-05,
	"loss": 0.0001,
	"num_tokens": 641400.0,
	"reward": 1.0437500476837158,
	"reward_std": 0.052891530096530914,
	"rewards/oai_reward_function/mean": 0.521874999627471,
	"rewards/oai_reward_function/std": 0.04741290956735611,
	"step": 36
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.13013662584125996,
	"epoch": 0.5285714285714286,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.05804259702563286,
	"kl": 0.01170262903906405,
	"learning_rate": 2.4285714285714288e-05,
	"loss": 0.0001,
	"num_tokens": 659192.0,
	"reward": 1.0812499523162842,
	"reward_std": 0.07288689911365509,
	"rewards/oai_reward_function/mean": 0.5406250022351742,
	"rewards/oai_reward_function/std": 0.09954533725976944,
	"step": 37
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09727449901401997,
	"epoch": 0.5428571428571428,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.07038255035877228,
	"kl": 0.009029814857058227,
	"learning_rate": 2.357142857142857e-05,
	"loss": 0.0001,
	"num_tokens": 677088.0,
	"reward": 1.423437476158142,
	"reward_std": 0.03818885609507561,
	"rewards/oai_reward_function/mean": 0.711718738079071,
	"rewards/oai_reward_function/std": 0.18094559013843536,
	"step": 38
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.11586509644985199,
	"epoch": 0.5571428571428572,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.08595240861177444,
	"kl": 0.011346436338499188,
	"learning_rate": 2.2857142857142858e-05,
	"loss": 0.0001,
	"num_tokens": 694920.0,
	"reward": 1.3796875476837158,
	"reward_std": 0.049540840089321136,
	"rewards/oai_reward_function/mean": 0.6898437440395355,
	"rewards/oai_reward_function/std": 0.20018735527992249,
	"step": 39
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.1129021979868412,
	"epoch": 0.5714285714285714,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.03896208480000496,
	"kl": 0.011648714076727629,
	"learning_rate": 2.214285714285714e-05,
	"loss": 0.0001,
	"num_tokens": 712560.0,
	"reward": 1.0031249523162842,
	"reward_std": 0.008838832378387451,
	"rewards/oai_reward_function/mean": 0.5015625000232831,
	"rewards/oai_reward_function/std": 0.008838835172355175,
	"step": 40
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.12671913765370846,
	"epoch": 0.5857142857142857,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.060481104999780655,
	"kl": 0.009595283307135105,
	"learning_rate": 2.1428571428571428e-05,
	"loss": 0.0001,
	"num_tokens": 730352.0,
	"reward": 1.037500023841858,
	"reward_std": 0.026726119220256805,
	"rewards/oai_reward_function/mean": 0.5187500007450581,
	"rewards/oai_reward_function/std": 0.0416397787630558,
	"step": 41
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.1355144940316677,
	"epoch": 0.6,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.10250900685787201,
	"kl": 0.010706432163715363,
	"learning_rate": 2.0714285714285718e-05,
	"loss": 0.0001,
	"num_tokens": 748080.0,
	"reward": 0.971875011920929,
	"reward_std": 0.16737449169158936,
	"rewards/oai_reward_function/mean": 0.48593750037252903,
	"rewards/oai_reward_function/std": 0.18062228709459305,
	"step": 42
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.11447549611330032,
	"epoch": 0.6142857142857143,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.07029257714748383,
	"kl": 0.011148489313200116,
	"learning_rate": 2e-05,
	"loss": 0.0001,
	"num_tokens": 765848.0,
	"reward": 1.029687523841858,
	"reward_std": 0.06395581364631653,
	"rewards/oai_reward_function/mean": 0.5148437507450581,
	"rewards/oai_reward_function/std": 0.05420219525694847,
	"step": 43
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.12138544581830502,
	"epoch": 0.6285714285714286,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.07489942759275436,
	"kl": 0.009310122113674879,
	"learning_rate": 1.928571428571429e-05,
	"loss": 0.0001,
	"num_tokens": 783552.0,
	"reward": 1.015625,
	"reward_std": 0.03808925300836563,
	"rewards/oai_reward_function/mean": 0.5078125,
	"rewards/oai_reward_function/std": 0.02870701625943184,
	"step": 44
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.11241224221885204,
	"epoch": 0.6428571428571429,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.06217681244015694,
	"kl": 0.015002928674221039,
	"learning_rate": 1.8571428571428572e-05,
	"loss": 0.0001,
	"num_tokens": 801392.0,
	"reward": 1.171875,
	"reward_std": 0.12756596505641937,
	"rewards/oai_reward_function/mean": 0.5859375,
	"rewards/oai_reward_function/std": 0.13151375949382782,
	"step": 45
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.10430784896016121,
	"epoch": 0.6571428571428571,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.07851167023181915,
	"kl": 0.013715020613744855,
	"learning_rate": 1.785714285714286e-05,
	"loss": 0.0001,
	"num_tokens": 819120.0,
	"reward": 1.1765625476837158,
	"reward_std": 0.11721621453762054,
	"rewards/oai_reward_function/mean": 0.5882812440395355,
	"rewards/oai_reward_function/std": 0.23893966525793076,
	"step": 46
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09174064546823502,
	"epoch": 0.6714285714285714,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.08239107578992844,
	"kl": 0.0339348167181015,
	"learning_rate": 1.7142857142857145e-05,
	"loss": 0.0003,
	"num_tokens": 836976.0,
	"reward": 1.1734375953674316,
	"reward_std": 0.07495103776454926,
	"rewards/oai_reward_function/mean": 0.5867187529802322,
	"rewards/oai_reward_function/std": 0.09024705737829208,
	"step": 47
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.12333916500210762,
	"epoch": 0.6857142857142857,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.06266991049051285,
	"kl": 0.011174799175933003,
	"learning_rate": 1.642857142857143e-05,
	"loss": 0.0001,
	"num_tokens": 854808.0,
	"reward": 1.021875023841858,
	"reward_std": 0.03390507400035858,
	"rewards/oai_reward_function/mean": 0.5109375007450581,
	"rewards/oai_reward_function/std": 0.03753358870744705,
	"step": 48
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.11699695512652397,
	"epoch": 0.7,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.06871096044778824,
	"kl": 0.011643779696896672,
	"learning_rate": 1.5714285714285715e-05,
	"loss": 0.0001,
	"num_tokens": 872616.0,
	"reward": 1.2109375,
	"reward_std": 0.020290398970246315,
	"rewards/oai_reward_function/mean": 0.6054687574505806,
	"rewards/oai_reward_function/std": 0.18247121572494507,
	"step": 49
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.11293753050267696,
	"epoch": 0.7142857142857143,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.03746737167239189,
	"kl": 0.008202132536098361,
	"learning_rate": 1.5e-05,
	"loss": 0.0001,
	"num_tokens": 890472.0,
	"reward": 1.0281250476837158,
	"reward_std": 0.008838837966322899,
	"rewards/oai_reward_function/mean": 0.514062499627471,
	"rewards/oai_reward_function/std": 0.026133574545383453,
	"step": 50
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.12108040601015091,
	"epoch": 0.7285714285714285,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.09108272194862366,
	"kl": 0.009169791359454393,
	"learning_rate": 1.4285714285714285e-05,
	"loss": 0.0001,
	"num_tokens": 908352.0,
	"reward": 1.126562476158142,
	"reward_std": 0.1380167454481125,
	"rewards/oai_reward_function/mean": 0.5632812455296516,
	"rewards/oai_reward_function/std": 0.1459098607301712,
	"step": 51
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.10115997679531574,
	"epoch": 0.7428571428571429,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.06731049716472626,
	"kl": 0.007746399496681988,
	"learning_rate": 1.357142857142857e-05,
	"loss": 0.0001,
	"num_tokens": 926032.0,
	"reward": 1.045312523841858,
	"reward_std": 0.04133228585124016,
	"rewards/oai_reward_function/mean": 0.5226562507450581,
	"rewards/oai_reward_function/std": 0.04369957000017166,
	"step": 52
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.114451814442873,
	"epoch": 0.7571428571428571,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0016380356391891837,
	"kl": 0.008339080261066556,
	"learning_rate": 1.2857142857142857e-05,
	"loss": 0.0001,
	"num_tokens": 943784.0,
	"reward": 1.0,
	"reward_std": 0.0,
	"rewards/oai_reward_function/mean": 0.5,
	"rewards/oai_reward_function/std": 0.0,
	"step": 53
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.1146883126348257,
	"epoch": 0.7714285714285715,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.0652991309762001,
	"kl": 0.014742115745320916,
	"learning_rate": 1.2142857142857144e-05,
	"loss": 0.0001,
	"num_tokens": 961592.0,
	"reward": 1.162500023841858,
	"reward_std": 0.09099893271923065,
	"rewards/oai_reward_function/mean": 0.5812500044703484,
	"rewards/oai_reward_function/std": 0.11896733194589615,
	"step": 54
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.0963958241045475,
	"epoch": 0.7857142857142857,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.06369847059249878,
	"kl": 0.00836158636957407,
	"learning_rate": 1.1428571428571429e-05,
	"loss": 0.0001,
	"num_tokens": 979312.0,
	"reward": 1.1375000476837158,
	"reward_std": 0.055009134113788605,
	"rewards/oai_reward_function/mean": 0.5687500014901161,
	"rewards/oai_reward_function/std": 0.12556324899196625,
	"step": 55
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.11964921839535236,
	"epoch": 0.8,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.10623525083065033,
	"kl": 0.008312122779898345,
	"learning_rate": 1.0714285714285714e-05,
	"loss": 0.0001,
	"num_tokens": 997072.0,
	"reward": 1.0031249523162842,
	"reward_std": 0.11129148304462433,
	"rewards/oai_reward_function/mean": 0.501562500372529,
	"rewards/oai_reward_function/std": 0.12565238773822784,
	"step": 56
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.10028179734945297,
	"epoch": 0.8142857142857143,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.05555571988224983,
	"kl": 0.012380573665723205,
	"learning_rate": 1e-05,
	"loss": 0.0001,
	"num_tokens": 1014864.0,
	"reward": 1.015625,
	"reward_std": 0.0265165027230978,
	"rewards/oai_reward_function/mean": 0.5078125,
	"rewards/oai_reward_function/std": 0.02870701625943184,
	"step": 57
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09480222314596176,
	"epoch": 0.8285714285714286,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.09733164310455322,
	"kl": 0.010292174993082881,
	"learning_rate": 9.285714285714286e-06,
	"loss": 0.0001,
	"num_tokens": 1032656.0,
	"reward": 1.125,
	"reward_std": 0.10169674456119537,
	"rewards/oai_reward_function/mean": 0.5624999962747097,
	"rewards/oai_reward_function/std": 0.10375995188951492,
	"step": 58
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.10845490545034409,
	"epoch": 0.8428571428571429,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.07499091327190399,
	"kl": 0.009870404610410333,
	"learning_rate": 8.571428571428573e-06,
	"loss": 0.0001,
	"num_tokens": 1050472.0,
	"reward": 1.0187499523162842,
	"reward_std": 0.02493581920862198,
	"rewards/oai_reward_function/mean": 0.509375000372529,
	"rewards/oai_reward_function/std": 0.019827887415885925,
	"step": 59
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.11447742953896523,
	"epoch": 0.8571428571428571,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.05116976425051689,
	"kl": 0.005832118098624051,
	"learning_rate": 7.857142857142858e-06,
	"loss": 0.0001,
	"num_tokens": 1068288.0,
	"reward": 1.0343749523162842,
	"reward_std": 0.029693374410271645,
	"rewards/oai_reward_function/mean": 0.517187500372529,
	"rewards/oai_reward_function/std": 0.04136652871966362,
	"step": 60
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.10109574533998966,
	"epoch": 0.8714285714285714,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.06300117075443268,
	"kl": 0.007976277614943683,
	"learning_rate": 7.142857142857143e-06,
	"loss": 0.0001,
	"num_tokens": 1086200.0,
	"reward": 1.0109374523162842,
	"reward_std": 0.023685520514845848,
	"rewards/oai_reward_function/mean": 0.5054687499068677,
	"rewards/oai_reward_function/std": 0.01765984110534191,
	"step": 61
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.10328171029686928,
	"epoch": 0.8857142857142857,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.041226889938116074,
	"kl": 0.00717292504850775,
	"learning_rate": 6.428571428571429e-06,
	"loss": 0.0001,
	"num_tokens": 1104056.0,
	"reward": 1.09375,
	"reward_std": 0.03720119222998619,
	"rewards/oai_reward_function/mean": 0.546875,
	"rewards/oai_reward_function/std": 0.08974651247262955,
	"step": 62
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09696869738399982,
	"epoch": 0.9,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.03586564213037491,
	"kl": 0.009956882800906897,
	"learning_rate": 5.7142857142857145e-06,
	"loss": 0.0001,
	"num_tokens": 1121872.0,
	"reward": 1.1218750476837158,
	"reward_std": 0.031160593032836914,
	"rewards/oai_reward_function/mean": 0.5609375014901161,
	"rewards/oai_reward_function/std": 0.11124978214502335,
	"step": 63
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.10472088679671288,
	"epoch": 0.9142857142857143,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.045453496277332306,
	"kl": 0.008746590930968523,
	"learning_rate": 5e-06,
	"loss": 0.0001,
	"num_tokens": 1139616.0,
	"reward": 1.0046875476837158,
	"reward_std": 0.00930030457675457,
	"rewards/oai_reward_function/mean": 0.5023437500931323,
	"rewards/oai_reward_function/std": 0.009753772988915443,
	"step": 64
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09775208681821823,
	"epoch": 0.9285714285714286,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.07111279666423798,
	"kl": 0.007679712725803256,
	"learning_rate": 4.285714285714286e-06,
	"loss": 0.0001,
	"num_tokens": 1157472.0,
	"reward": 1.0890624523162842,
	"reward_std": 0.04253753647208214,
	"rewards/oai_reward_function/mean": 0.5445312485098839,
	"rewards/oai_reward_function/std": 0.08174862712621689,
	"step": 65
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.10566045716404915,
	"epoch": 0.9428571428571428,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.09820882230997086,
	"kl": 0.005833235685713589,
	"learning_rate": 3.5714285714285714e-06,
	"loss": 0.0001,
	"num_tokens": 1175344.0,
	"reward": 1.3125,
	"reward_std": 0.0763113722205162,
	"rewards/oai_reward_function/mean": 0.65625,
	"rewards/oai_reward_function/std": 0.199495330452919,
	"step": 66
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09616570547223091,
	"epoch": 0.9571428571428572,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.060254957526922226,
	"kl": 0.005365552264265716,
	"learning_rate": 2.8571428571428573e-06,
	"loss": 0.0001,
	"num_tokens": 1193248.0,
	"reward": 1.0703125,
	"reward_std": 0.026579536497592926,
	"rewards/oai_reward_function/mean": 0.53515625,
	"rewards/oai_reward_function/std": 0.06377232819795609,
	"step": 67
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09799160063266754,
	"epoch": 0.9714285714285714,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0033068626653403044,
	"kl": 0.010790573665872216,
	"learning_rate": 2.142857142857143e-06,
	"loss": 0.0001,
	"num_tokens": 1210992.0,
	"reward": 1.0,
	"reward_std": 0.0,
	"rewards/oai_reward_function/mean": 0.5,
	"rewards/oai_reward_function/std": 0.0,
	"step": 68
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09545023553073406,
	"epoch": 0.9857142857142858,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.05336514860391617,
	"kl": 0.005807638866826892,
	"learning_rate": 1.4285714285714286e-06,
	"loss": 0.0001,
	"num_tokens": 1228816.0,
	"reward": 1.0125000476837158,
	"reward_std": 0.013363069854676723,
	"rewards/oai_reward_function/mean": 0.5062500000931323,
	"rewards/oai_reward_function/std": 0.016800537705421448,
	"step": 69
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.10368440486490726,
	"epoch": 1.0,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.050165776163339615,
	"kl": 0.008004623581655324,
	"learning_rate": 7.142857142857143e-07,
	"loss": 0.0001,
	"num_tokens": 1246584.0,
	"reward": 1.017187476158142,
	"reward_std": 0.017598580569028854,
	"rewards/oai_reward_function/mean": 0.5085937501862645,
	"rewards/oai_reward_function/std": 0.022548669949173927,
	"step": 70
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09804531745612621,
	"epoch": 1.0142857142857142,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.05341744422912598,
	"kl": 0.01710776425898075,
	"learning_rate": 0.0,
	"loss": 0.0002,
	"num_tokens": 1264416.0,
	"reward": 1.0875000953674316,
	"reward_std": 0.03174196928739548,
	"rewards/oai_reward_function/mean": 0.5437500029802322,
	"rewards/oai_reward_function/std": 0.0375671461224556,
	"step": 71
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09854021109640598,
	"epoch": 1.0285714285714285,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.06506258249282837,
	"kl": 0.009508747374638915,
	"learning_rate": 4.4928571428571434e-05,
	"loss": 0.0001,
	"num_tokens": 1282296.0,
	"reward": 1.0406250953674316,
	"reward_std": 0.0222018975764513,
	"rewards/oai_reward_function/mean": 0.5203125011175871,
	"rewards/oai_reward_function/std": 0.035603947937488556,
	"step": 72
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08583058044314384,
	"epoch": 1.042857142857143,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.07061073184013367,
	"kl": 0.005951485480181873,
	"learning_rate": 4.485714285714286e-05,
	"loss": 0.0001,
	"num_tokens": 1300008.0,
	"reward": 1.0234375,
	"reward_std": 0.01804211549460888,
	"rewards/oai_reward_function/mean": 0.51171875,
	"rewards/oai_reward_function/std": 0.020064787939190865,
	"step": 73
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09507345780730247,
	"epoch": 1.0571428571428572,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.05930415913462639,
	"kl": 0.007875082548707724,
	"learning_rate": 4.478571428571429e-05,
	"loss": 0.0001,
	"num_tokens": 1317832.0,
	"reward": 1.234375,
	"reward_std": 0.01088879257440567,
	"rewards/oai_reward_function/mean": 0.6171875,
	"rewards/oai_reward_function/std": 0.20452910661697388,
	"step": 74
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09059166349470615,
	"epoch": 1.0714285714285714,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.04293040931224823,
	"kl": 0.01093563821632415,
	"learning_rate": 4.471428571428571e-05,
	"loss": 0.0001,
	"num_tokens": 1335584.0,
	"reward": 1.0281250476837158,
	"reward_std": 0.008838837966322899,
	"rewards/oai_reward_function/mean": 0.514062499627471,
	"rewards/oai_reward_function/std": 0.026133574545383453,
	"step": 75
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08499786630272865,
	"epoch": 1.0857142857142856,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.06890492141246796,
	"kl": 0.007947787176817656,
	"learning_rate": 4.464285714285715e-05,
	"loss": 0.0001,
	"num_tokens": 1353400.0,
	"reward": 1.0734374523162842,
	"reward_std": 0.03388907015323639,
	"rewards/oai_reward_function/mean": 0.5367187522351742,
	"rewards/oai_reward_function/std": 0.05607611685991287,
	"step": 76
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08190344646573067,
	"epoch": 1.1,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.07835045456886292,
	"kl": 0.010214838432148099,
	"learning_rate": 4.4571428571428574e-05,
	"loss": 0.0001,
	"num_tokens": 1371176.0,
	"reward": 1.21875,
	"reward_std": 0.03153933212161064,
	"rewards/oai_reward_function/mean": 0.609375,
	"rewards/oai_reward_function/std": 0.12727762758731842,
	"step": 77
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09901309013366699,
	"epoch": 1.1142857142857143,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.07858891785144806,
	"kl": 0.006452999892644584,
	"learning_rate": 4.4500000000000004e-05,
	"loss": 0.0001,
	"num_tokens": 1388952.0,
	"reward": 1.053125023841858,
	"reward_std": 0.028757737949490547,
	"rewards/oai_reward_function/mean": 0.5265625007450581,
	"rewards/oai_reward_function/std": 0.02905604988336563,
	"step": 78
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09012427926063538,
	"epoch": 1.1285714285714286,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.08503545820713043,
	"kl": 0.01038876292295754,
	"learning_rate": 4.442857142857143e-05,
	"loss": 0.0001,
	"num_tokens": 1406744.0,
	"reward": 1.084375023841858,
	"reward_std": 0.07080081105232239,
	"rewards/oai_reward_function/mean": 0.5421874970197678,
	"rewards/oai_reward_function/std": 0.09233474731445312,
	"step": 79
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.07716062478721142,
	"epoch": 1.1428571428571428,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.0729447677731514,
	"kl": 0.012507579056546092,
	"learning_rate": 4.435714285714286e-05,
	"loss": 0.0001,
	"num_tokens": 1424568.0,
	"reward": 1.2468750476837158,
	"reward_std": 0.03139737993478775,
	"rewards/oai_reward_function/mean": 0.6234374940395355,
	"rewards/oai_reward_function/std": 0.19123300909996033,
	"step": 80
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09090105071663857,
	"epoch": 1.157142857142857,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.07926075905561447,
	"kl": 0.010987127898260951,
	"learning_rate": 4.428571428571428e-05,
	"loss": 0.0001,
	"num_tokens": 1442480.0,
	"reward": 1.0984375476837158,
	"reward_std": 0.0697232112288475,
	"rewards/oai_reward_function/mean": 0.5492187514901161,
	"rewards/oai_reward_function/std": 0.06006864085793495,
	"step": 81
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08928278088569641,
	"epoch": 1.1714285714285715,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.08457206189632416,
	"kl": 0.004951049922965467,
	"learning_rate": 4.4214285714285714e-05,
	"loss": 0.0,
	"num_tokens": 1460344.0,
	"reward": 1.0875000953674316,
	"reward_std": 0.04518735408782959,
	"rewards/oai_reward_function/mean": 0.5437499992549419,
	"rewards/oai_reward_function/std": 0.07156093418598175,
	"step": 82
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08692280948162079,
	"epoch": 1.1857142857142857,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.09246931225061417,
	"kl": 0.015749768121168017,
	"learning_rate": 4.4142857142857144e-05,
	"loss": 0.0002,
	"num_tokens": 1478248.0,
	"reward": 1.264062523841858,
	"reward_std": 0.03826536983251572,
	"rewards/oai_reward_function/mean": 0.6320312470197678,
	"rewards/oai_reward_function/std": 0.17668935656547546,
	"step": 83
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08908558450639248,
	"epoch": 1.2,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.05387440696358681,
	"kl": 0.0058196637546643615,
	"learning_rate": 4.4071428571428575e-05,
	"loss": 0.0001,
	"num_tokens": 1496112.0,
	"reward": 1.0078125,
	"reward_std": 0.011451572179794312,
	"rewards/oai_reward_function/mean": 0.50390625,
	"rewards/oai_reward_function/std": 0.012872475199401379,
	"step": 84
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08258137106895447,
	"epoch": 1.2142857142857142,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.05658518522977829,
	"kl": 0.004440092074219137,
	"learning_rate": 4.4000000000000006e-05,
	"loss": 0.0,
	"num_tokens": 1513752.0,
	"reward": 1.001562476158142,
	"reward_std": 0.004419416189193726,
	"rewards/oai_reward_function/mean": 0.5007812500116415,
	"rewards/oai_reward_function/std": 0.0044194175861775875,
	"step": 85
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.07238267548382282,
	"epoch": 1.2285714285714286,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.07421422004699707,
	"kl": 0.006456690724007785,
	"learning_rate": 4.392857142857143e-05,
	"loss": 0.0001,
	"num_tokens": 1531512.0,
	"reward": 1.048437476158142,
	"reward_std": 0.023024337366223335,
	"rewards/oai_reward_function/mean": 0.5242187492549419,
	"rewards/oai_reward_function/std": 0.030772563070058823,
	"step": 86
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08944158256053925,
	"epoch": 1.2428571428571429,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.06827304512262344,
	"kl": 0.006732087349519134,
	"learning_rate": 4.385714285714286e-05,
	"loss": 0.0001,
	"num_tokens": 1549400.0,
	"reward": 1.1703124046325684,
	"reward_std": 0.06780597567558289,
	"rewards/oai_reward_function/mean": 0.5851562544703484,
	"rewards/oai_reward_function/std": 0.15987133979797363,
	"step": 87
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08645510673522949,
	"epoch": 1.2571428571428571,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.06966782361268997,
	"kl": 0.008698969963006675,
	"learning_rate": 4.3785714285714284e-05,
	"loss": 0.0001,
	"num_tokens": 1567168.0,
	"reward": 1.0187499523162842,
	"reward_std": 0.018725106492638588,
	"rewards/oai_reward_function/mean": 0.509375000372529,
	"rewards/oai_reward_function/std": 0.01878357119858265,
	"step": 88
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.10576100833714008,
	"epoch": 1.2714285714285714,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.04759611934423447,
	"kl": 0.009460748406127095,
	"learning_rate": 4.371428571428572e-05,
	"loss": 0.0001,
	"num_tokens": 1585032.0,
	"reward": 1.0812499523162842,
	"reward_std": 0.07165143638849258,
	"rewards/oai_reward_function/mean": 0.5406249985098839,
	"rewards/oai_reward_function/std": 0.0987318754196167,
	"step": 89
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.07568562775850296,
	"epoch": 1.2857142857142856,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.052769020199775696,
	"kl": 0.005130159552209079,
	"learning_rate": 4.3642857142857146e-05,
	"loss": 0.0001,
	"num_tokens": 1602784.0,
	"reward": 1.0562500953674316,
	"reward_std": 0.03836483508348465,
	"rewards/oai_reward_function/mean": 0.5281250011175871,
	"rewards/oai_reward_function/std": 0.03952847048640251,
	"step": 90
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08074977435171604,
	"epoch": 1.3,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.07572436332702637,
	"kl": 0.00757291610352695,
	"learning_rate": 4.3571428571428576e-05,
	"loss": 0.0001,
	"num_tokens": 1620496.0,
	"reward": 1.0546875,
	"reward_std": 0.032445792108774185,
	"rewards/oai_reward_function/mean": 0.52734375,
	"rewards/oai_reward_function/std": 0.03321446478366852,
	"step": 91
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.07088322378695011,
	"epoch": 1.3142857142857143,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.07621358335018158,
	"kl": 0.005998459528200328,
	"learning_rate": 4.35e-05,
	"loss": 0.0001,
	"num_tokens": 1638464.0,
	"reward": 1.2015624046325684,
	"reward_std": 0.11482575535774231,
	"rewards/oai_reward_function/mean": 0.6007812544703484,
	"rewards/oai_reward_function/std": 0.26282399147748947,
	"step": 92
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.06381132267415524,
	"epoch": 1.3285714285714285,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.030587607994675636,
	"kl": 0.007369687547907233,
	"learning_rate": 4.342857142857143e-05,
	"loss": 0.0001,
	"num_tokens": 1656176.0,
	"reward": 1.0031249523162842,
	"reward_std": 0.008838832378387451,
	"rewards/oai_reward_function/mean": 0.5015625000232831,
	"rewards/oai_reward_function/std": 0.008838835172355175,
	"step": 93
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.07285293377935886,
	"epoch": 1.342857142857143,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.0500265508890152,
	"kl": 0.006194314104504883,
	"learning_rate": 4.3357142857142855e-05,
	"loss": 0.0001,
	"num_tokens": 1674088.0,
	"reward": 1.109375,
	"reward_std": 0.04590248316526413,
	"rewards/oai_reward_function/mean": 0.5546875037252903,
	"rewards/oai_reward_function/std": 0.07967613637447357,
	"step": 94
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.086557412520051,
	"epoch": 1.3571428571428572,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.07414961606264114,
	"kl": 0.010996793280355632,
	"learning_rate": 4.328571428571429e-05,
	"loss": 0.0001,
	"num_tokens": 1692048.0,
	"reward": 1.0671875476837158,
	"reward_std": 0.03708447515964508,
	"rewards/oai_reward_function/mean": 0.5335937514901161,
	"rewards/oai_reward_function/std": 0.04561823233962059,
	"step": 95
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08167718537151814,
	"epoch": 1.3714285714285714,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0010896283201873302,
	"kl": 0.004855156294070184,
	"learning_rate": 4.3214285714285716e-05,
	"loss": 0.0,
	"num_tokens": 1709760.0,
	"reward": 1.0,
	"reward_std": 0.0,
	"rewards/oai_reward_function/mean": 0.5,
	"rewards/oai_reward_function/std": 0.0,
	"step": 96
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.07798840664327145,
	"epoch": 1.3857142857142857,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.09227096289396286,
	"kl": 0.014819784788414836,
	"learning_rate": 4.314285714285715e-05,
	"loss": 0.0001,
	"num_tokens": 1727480.0,
	"reward": 1.2296874523162842,
	"reward_std": 0.1029118224978447,
	"rewards/oai_reward_function/mean": 0.6148437485098839,
	"rewards/oai_reward_function/std": 0.12918156385421753,
	"step": 97
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.07132465578615665,
	"epoch": 1.4,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.07690515369176865,
	"kl": 0.0082227170933038,
	"learning_rate": 4.307142857142857e-05,
	"loss": 0.0001,
	"num_tokens": 1745336.0,
	"reward": 1.037500023841858,
	"reward_std": 0.023145508021116257,
	"rewards/oai_reward_function/mean": 0.5187500007450581,
	"rewards/oai_reward_function/std": 0.030453559011220932,
	"step": 98
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08682013675570488,
	"epoch": 1.4142857142857144,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.09899340569972992,
	"kl": 0.007965923519805074,
	"learning_rate": 4.3e-05,
	"loss": 0.0001,
	"num_tokens": 1763136.0,
	"reward": 1.1531250476837158,
	"reward_std": 0.13869836926460266,
	"rewards/oai_reward_function/mean": 0.5765625014901161,
	"rewards/oai_reward_function/std": 0.2821534648537636,
	"step": 99
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08919607102870941,
	"epoch": 1.4285714285714286,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.080818772315979,
	"kl": 0.007981272647157311,
	"learning_rate": 4.292857142857143e-05,
	"loss": 0.0001,
	"num_tokens": 1780920.0,
	"reward": 1.2093749046325684,
	"reward_std": 0.020411580801010132,
	"rewards/oai_reward_function/mean": 0.6046875044703484,
	"rewards/oai_reward_function/std": 0.18110741674900055,
	"step": 100
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.07852962799370289,
	"epoch": 1.4428571428571428,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.09296028316020966,
	"kl": 0.012157463701441884,
	"learning_rate": 4.2857142857142856e-05,
	"loss": 0.0001,
	"num_tokens": 1798872.0,
	"reward": 1.5265624523162842,
	"reward_std": 0.04206090793013573,
	"rewards/oai_reward_function/mean": 0.7632812559604645,
	"rewards/oai_reward_function/std": 0.22771519422531128,
	"step": 101
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.086120730265975,
	"epoch": 1.457142857142857,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.0995735228061676,
	"kl": 0.012111627496778965,
	"learning_rate": 4.278571428571429e-05,
	"loss": 0.0001,
	"num_tokens": 1816744.0,
	"reward": 1.0968749523162842,
	"reward_std": 0.049927353858947754,
	"rewards/oai_reward_function/mean": 0.5484374985098839,
	"rewards/oai_reward_function/std": 0.049974795430898666,
	"step": 102
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.07998536713421345,
	"epoch": 1.4714285714285715,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.06524667888879776,
	"kl": 0.006055153091438115,
	"learning_rate": 4.271428571428572e-05,
	"loss": 0.0001,
	"num_tokens": 1834456.0,
	"reward": 1.0125000476837158,
	"reward_std": 0.02314549870789051,
	"rewards/oai_reward_function/mean": 0.5062500000931323,
	"rewards/oai_reward_function/std": 0.016800537705421448,
	"step": 103
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.07001950591802597,
	"epoch": 1.4857142857142858,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.11485958099365234,
	"kl": 0.00914135156199336,
	"learning_rate": 4.264285714285715e-05,
	"loss": 0.0001,
	"num_tokens": 1852408.0,
	"reward": 1.1859374046325684,
	"reward_std": 0.08434940874576569,
	"rewards/oai_reward_function/mean": 0.5929687544703484,
	"rewards/oai_reward_function/std": 0.09821683913469315,
	"step": 104
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.07728070393204689,
	"epoch": 1.5,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.07163766771554947,
	"kl": 0.006072127376683056,
	"learning_rate": 4.257142857142857e-05,
	"loss": 0.0001,
	"num_tokens": 1870208.0,
	"reward": 1.0125000476837158,
	"reward_std": 0.02314549870789051,
	"rewards/oai_reward_function/mean": 0.5062500000931323,
	"rewards/oai_reward_function/std": 0.016800537705421448,
	"step": 105
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08027334697544575,
	"epoch": 1.5142857142857142,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.07313752919435501,
	"kl": 0.011275349417701364,
	"learning_rate": 4.25e-05,
	"loss": 0.0001,
	"num_tokens": 1887984.0,
	"reward": 1.0593750476837158,
	"reward_std": 0.022558562457561493,
	"rewards/oai_reward_function/mean": 0.529687499627471,
	"rewards/oai_reward_function/std": 0.03386256843805313,
	"step": 106
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08169634826481342,
	"epoch": 1.5285714285714285,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.09352786093950272,
	"kl": 0.014267339138314128,
	"learning_rate": 4.242857142857143e-05,
	"loss": 0.0001,
	"num_tokens": 1905776.0,
	"reward": 1.115625023841858,
	"reward_std": 0.055196452885866165,
	"rewards/oai_reward_function/mean": 0.5578125044703484,
	"rewards/oai_reward_function/std": 0.09427942335605621,
	"step": 107
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.05949794687330723,
	"epoch": 1.5428571428571427,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.08965466171503067,
	"kl": 0.014173903269693255,
	"learning_rate": 4.2357142857142864e-05,
	"loss": 0.0001,
	"num_tokens": 1923672.0,
	"reward": 1.357812523841858,
	"reward_std": 0.07401138544082642,
	"rewards/oai_reward_function/mean": 0.6789062470197678,
	"rewards/oai_reward_function/std": 0.1690949946641922,
	"step": 108
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08150264620780945,
	"epoch": 1.5571428571428572,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.08684907853603363,
	"kl": 0.015842870343476534,
	"learning_rate": 4.228571428571429e-05,
	"loss": 0.0002,
	"num_tokens": 1941504.0,
	"reward": 1.334375023841858,
	"reward_std": 0.03491953760385513,
	"rewards/oai_reward_function/mean": 0.6671874970197678,
	"rewards/oai_reward_function/std": 0.18364998698234558,
	"step": 109
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.06982677057385445,
	"epoch": 1.5714285714285714,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.06661536544561386,
	"kl": 0.008391729556024075,
	"learning_rate": 4.221428571428572e-05,
	"loss": 0.0001,
	"num_tokens": 1959144.0,
	"reward": 1.03125,
	"reward_std": 0.019731827080249786,
	"rewards/oai_reward_function/mean": 0.515625,
	"rewards/oai_reward_function/std": 0.025988519191741943,
	"step": 110
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09021224454045296,
	"epoch": 1.5857142857142859,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.05824963003396988,
	"kl": 0.008994318312034011,
	"learning_rate": 4.214285714285714e-05,
	"loss": 0.0001,
	"num_tokens": 1976936.0,
	"reward": 1.037500023841858,
	"reward_std": 0.013363069854676723,
	"rewards/oai_reward_function/mean": 0.5187500007450581,
	"rewards/oai_reward_function/std": 0.0353553369641304,
	"step": 111
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09183148294687271,
	"epoch": 1.6,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.08340641111135483,
	"kl": 0.010920959059149027,
	"learning_rate": 4.2071428571428574e-05,
	"loss": 0.0001,
	"num_tokens": 1994664.0,
	"reward": 1.0390625,
	"reward_std": 0.027564914897084236,
	"rewards/oai_reward_function/mean": 0.51953125,
	"rewards/oai_reward_function/std": 0.0395205020904541,
	"step": 112
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08369805663824081,
	"epoch": 1.6142857142857143,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.09711972624063492,
	"kl": 0.009947408339940012,
	"learning_rate": 4.2e-05,
	"loss": 0.0001,
	"num_tokens": 2012432.0,
	"reward": 1.0703125,
	"reward_std": 0.02308514341711998,
	"rewards/oai_reward_function/mean": 0.53515625,
	"rewards/oai_reward_function/std": 0.04438621550798416,
	"step": 113
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.096822340041399,
	"epoch": 1.6285714285714286,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.07382018864154816,
	"kl": 0.017859197221696377,
	"learning_rate": 4.192857142857143e-05,
	"loss": 0.0002,
	"num_tokens": 2030136.0,
	"reward": 1.0359375476837158,
	"reward_std": 0.035533398389816284,
	"rewards/oai_reward_function/mean": 0.517968749627471,
	"rewards/oai_reward_function/std": 0.03252441808581352,
	"step": 114
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08726120926439762,
	"epoch": 1.6428571428571428,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.0782691165804863,
	"kl": 0.012930417666211724,
	"learning_rate": 4.185714285714286e-05,
	"loss": 0.0001,
	"num_tokens": 2047976.0,
	"reward": 1.1749999523162842,
	"reward_std": 0.08762745559215546,
	"rewards/oai_reward_function/mean": 0.5874999985098839,
	"rewards/oai_reward_function/std": 0.1177750751376152,
	"step": 115
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.07392177730798721,
	"epoch": 1.657142857142857,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.07505157589912415,
	"kl": 0.012273511849343777,
	"learning_rate": 4.178571428571429e-05,
	"loss": 0.0001,
	"num_tokens": 2065704.0,
	"reward": 1.1953125,
	"reward_std": 0.06742400676012039,
	"rewards/oai_reward_function/mean": 0.59765625,
	"rewards/oai_reward_function/std": 0.13019207119941711,
	"step": 116
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.065590625628829,
	"epoch": 1.6714285714285713,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.08693981915712357,
	"kl": 0.02323699276894331,
	"learning_rate": 4.1714285714285714e-05,
	"loss": 0.0002,
	"num_tokens": 2083560.0,
	"reward": 1.1328125,
	"reward_std": 0.04593653976917267,
	"rewards/oai_reward_function/mean": 0.56640625,
	"rewards/oai_reward_function/std": 0.04902656376361847,
	"step": 117
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09886737167835236,
	"epoch": 1.6857142857142857,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.05793704837560654,
	"kl": 0.015232619596645236,
	"learning_rate": 4.1642857142857144e-05,
	"loss": 0.0002,
	"num_tokens": 2101392.0,
	"reward": 1.03125,
	"reward_std": 0.011572758667171001,
	"rewards/oai_reward_function/mean": 0.515625,
	"rewards/oai_reward_function/std": 0.029614457860589027,
	"step": 118
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09572554007172585,
	"epoch": 1.7,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.03639426827430725,
	"kl": 0.017274728044867516,
	"learning_rate": 4.1571428571428575e-05,
	"loss": 0.0002,
	"num_tokens": 2119200.0,
	"reward": 1.2000000476837158,
	"reward_std": 0.018898215144872665,
	"rewards/oai_reward_function/mean": 0.5999999940395355,
	"rewards/oai_reward_function/std": 0.17689070105552673,
	"step": 119
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08603023178875446,
	"epoch": 1.7142857142857144,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.10388008505105972,
	"kl": 0.022097071167081594,
	"learning_rate": 4.15e-05,
	"loss": 0.0002,
	"num_tokens": 2137056.0,
	"reward": 1.0578125715255737,
	"reward_std": 0.04522190988063812,
	"rewards/oai_reward_function/mean": 0.528906250372529,
	"rewards/oai_reward_function/std": 0.03971134498715401,
	"step": 120
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.0950616579502821,
	"epoch": 1.7285714285714286,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.09005559235811234,
	"kl": 0.015552334254607558,
	"learning_rate": 4.1428571428571437e-05,
	"loss": 0.0002,
	"num_tokens": 2154936.0,
	"reward": 1.268125057220459,
	"reward_std": 0.0341712087392807,
	"rewards/oai_reward_function/mean": 0.6340624988079071,
	"rewards/oai_reward_function/std": 0.2080029398202896,
	"step": 121
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.07687668316066265,
	"epoch": 1.7428571428571429,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.08279760181903839,
	"kl": 0.009949938859790564,
	"learning_rate": 4.135714285714286e-05,
	"loss": 0.0001,
	"num_tokens": 2172616.0,
	"reward": 1.0625,
	"reward_std": 0.04204372316598892,
	"rewards/oai_reward_function/mean": 0.53125,
	"rewards/oai_reward_function/std": 0.04353345185518265,
	"step": 122
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08731912076473236,
	"epoch": 1.7571428571428571,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.06586393713951111,
	"kl": 0.008837034576572478,
	"learning_rate": 4.128571428571429e-05,
	"loss": 0.0001,
	"num_tokens": 2190368.0,
	"reward": 1.0187499523162842,
	"reward_std": 0.021777570247650146,
	"rewards/oai_reward_function/mean": 0.509375000372529,
	"rewards/oai_reward_function/std": 0.019827887415885925,
	"step": 123
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08952882327139378,
	"epoch": 1.7714285714285714,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.09899036586284637,
	"kl": 0.013539569219574332,
	"learning_rate": 4.1214285714285715e-05,
	"loss": 0.0001,
	"num_tokens": 2208176.0,
	"reward": 1.1140625476837158,
	"reward_std": 0.06264616549015045,
	"rewards/oai_reward_function/mean": 0.5570312514901161,
	"rewards/oai_reward_function/std": 0.04455622285604477,
	"step": 124
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.07565303146839142,
	"epoch": 1.7857142857142856,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.08115291595458984,
	"kl": 0.009995393920689821,
	"learning_rate": 4.1142857142857146e-05,
	"loss": 0.0001,
	"num_tokens": 2225896.0,
	"reward": 1.1749999523162842,
	"reward_std": 0.06661029160022736,
	"rewards/oai_reward_function/mean": 0.5874999985098839,
	"rewards/oai_reward_function/std": 0.1399884670972824,
	"step": 125
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09428555145859718,
	"epoch": 1.8,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.06345849484205246,
	"kl": 0.01248577213846147,
	"learning_rate": 4.107142857142857e-05,
	"loss": 0.0001,
	"num_tokens": 2243656.0,
	"reward": 1.0390625,
	"reward_std": 0.02052600309252739,
	"rewards/oai_reward_function/mean": 0.51953125,
	"rewards/oai_reward_function/std": 0.0395205020904541,
	"step": 126
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.07595096342265606,
	"epoch": 1.8142857142857143,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.09258188307285309,
	"kl": 0.00919699075166136,
	"learning_rate": 4.1e-05,
	"loss": 0.0001,
	"num_tokens": 2261448.0,
	"reward": 1.0343749523162842,
	"reward_std": 0.03808924928307533,
	"rewards/oai_reward_function/mean": 0.517187500372529,
	"rewards/oai_reward_function/std": 0.029400940984487534,
	"step": 127
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08019419759511948,
	"epoch": 1.8285714285714287,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.13585439324378967,
	"kl": 0.019885767716914415,
	"learning_rate": 4.092857142857143e-05,
	"loss": 0.0002,
	"num_tokens": 2279240.0,
	"reward": 1.3046875,
	"reward_std": 0.1113169863820076,
	"rewards/oai_reward_function/mean": 0.65234375,
	"rewards/oai_reward_function/std": 0.17964564263820648,
	"step": 128
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09248529188334942,
	"epoch": 1.842857142857143,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.08260349929332733,
	"kl": 0.013745760545134544,
	"learning_rate": 4.085714285714286e-05,
	"loss": 0.0001,
	"num_tokens": 2297056.0,
	"reward": 1.0734375715255737,
	"reward_std": 0.034589797258377075,
	"rewards/oai_reward_function/mean": 0.5367187522351742,
	"rewards/oai_reward_function/std": 0.03359169885516167,
	"step": 129
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09751161187887192,
	"epoch": 1.8571428571428572,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.06302473694086075,
	"kl": 0.011824949877336621,
	"learning_rate": 4.0785714285714286e-05,
	"loss": 0.0001,
	"num_tokens": 2314872.0,
	"reward": 1.0203125476837158,
	"reward_std": 0.013258256018161774,
	"rewards/oai_reward_function/mean": 0.510156249627471,
	"rewards/oai_reward_function/std": 0.021867798641324043,
	"step": 130
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09040896967053413,
	"epoch": 1.8714285714285714,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.09831514209508896,
	"kl": 0.01766400644555688,
	"learning_rate": 4.0714285714285717e-05,
	"loss": 0.0002,
	"num_tokens": 2332784.0,
	"reward": 1.0187499523162842,
	"reward_std": 0.028380058705806732,
	"rewards/oai_reward_function/mean": 0.509375000372529,
	"rewards/oai_reward_function/std": 0.01878357119858265,
	"step": 131
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09301545284688473,
	"epoch": 1.8857142857142857,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.08136897534132004,
	"kl": 0.02844544965773821,
	"learning_rate": 4.064285714285714e-05,
	"loss": 0.0003,
	"num_tokens": 2350640.0,
	"reward": 1.060937523841858,
	"reward_std": 0.03093591332435608,
	"rewards/oai_reward_function/mean": 0.5304687507450581,
	"rewards/oai_reward_function/std": 0.054895199835300446,
	"step": 132
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08573882840573788,
	"epoch": 1.9,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.0735137015581131,
	"kl": 0.023084456101059914,
	"learning_rate": 4.057142857142857e-05,
	"loss": 0.0002,
	"num_tokens": 2368456.0,
	"reward": 1.0734374523162842,
	"reward_std": 0.02894335612654686,
	"rewards/oai_reward_function/mean": 0.5367187522351742,
	"rewards/oai_reward_function/std": 0.05461905151605606,
	"step": 133
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.10109273716807365,
	"epoch": 1.9142857142857141,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.10477015376091003,
	"kl": 0.03489594021812081,
	"learning_rate": 4.05e-05,
	"loss": 0.0003,
	"num_tokens": 2386200.0,
	"reward": 1.0046875476837158,
	"reward_std": 0.11875393241643906,
	"rewards/oai_reward_function/mean": 0.5023437514901161,
	"rewards/oai_reward_function/std": 0.11234594881534576,
	"step": 134
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08120713755488396,
	"epoch": 1.9285714285714286,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.10714246332645416,
	"kl": 0.024183190893381834,
	"learning_rate": 4.042857142857143e-05,
	"loss": 0.0002,
	"num_tokens": 2404056.0,
	"reward": 1.09375,
	"reward_std": 0.05726175755262375,
	"rewards/oai_reward_function/mean": 0.546875,
	"rewards/oai_reward_function/std": 0.05982164293527603,
	"step": 135
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.089906245470047,
	"epoch": 1.9428571428571428,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.12431693077087402,
	"kl": 0.038056795950978994,
	"learning_rate": 4.035714285714286e-05,
	"loss": 0.0004,
	"num_tokens": 2421928.0,
	"reward": 1.3406250476837158,
	"reward_std": 0.05260005593299866,
	"rewards/oai_reward_function/mean": 0.6703125089406967,
	"rewards/oai_reward_function/std": 0.19317355751991272,
	"step": 136
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09174446761608124,
	"epoch": 1.9571428571428573,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.07566652446985245,
	"kl": 0.023516141809523106,
	"learning_rate": 4.028571428571429e-05,
	"loss": 0.0002,
	"num_tokens": 2439832.0,
	"reward": 1.0640625953674316,
	"reward_std": 0.026437407359480858,
	"rewards/oai_reward_function/mean": 0.5320312529802322,
	"rewards/oai_reward_function/std": 0.04027845337986946,
	"step": 137
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.0811004675924778,
	"epoch": 1.9714285714285715,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.1272999793291092,
	"kl": 0.035705497954040766,
	"learning_rate": 4.021428571428572e-05,
	"loss": 0.0004,
	"num_tokens": 2457576.0,
	"reward": 1.0421874523162842,
	"reward_std": 0.04250866919755936,
	"rewards/oai_reward_function/mean": 0.521093750372529,
	"rewards/oai_reward_function/std": 0.03971134498715401,
	"step": 138
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08940452709794044,
	"epoch": 1.9857142857142858,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.10179316252470016,
	"kl": 0.03542056027799845,
	"learning_rate": 4.014285714285714e-05,
	"loss": 0.0004,
	"num_tokens": 2475400.0,
	"reward": 1.0484375953674316,
	"reward_std": 0.03541836887598038,
	"rewards/oai_reward_function/mean": 0.5242187511175871,
	"rewards/oai_reward_function/std": 0.04375720024108887,
	"step": 139
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08982641063630581,
	"epoch": 2.0,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.05322287976741791,
	"kl": 0.024960508104413748,
	"learning_rate": 4.007142857142857e-05,
	"loss": 0.0002,
	"num_tokens": 2493168.0,
	"reward": 1.0421874523162842,
	"reward_std": 0.024032622575759888,
	"rewards/oai_reward_function/mean": 0.521093750372529,
	"rewards/oai_reward_function/std": 0.04358407482504845,
	"step": 140
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.07347713969647884,
	"epoch": 2.0142857142857142,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.054996706545352936,
	"kl": 0.029410825110971928,
	"learning_rate": 4e-05,
	"loss": 0.0003,
	"num_tokens": 2510968.0,
	"reward": 1.2937500476837158,
	"reward_std": 0.006681524682790041,
	"rewards/oai_reward_function/mean": 0.6468750089406967,
	"rewards/oai_reward_function/std": 0.2041652947664261,
	"step": 141
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.11200576089322567,
	"epoch": 2.0285714285714285,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.07548272609710693,
	"kl": 0.03502320311963558,
	"learning_rate": 3.9928571428571434e-05,
	"loss": 0.0004,
	"num_tokens": 2528744.0,
	"reward": 1.095312476158142,
	"reward_std": 0.0437462255358696,
	"rewards/oai_reward_function/mean": 0.5476562529802322,
	"rewards/oai_reward_function/std": 0.05692360922694206,
	"step": 142
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09486313536763191,
	"epoch": 2.0428571428571427,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.05399833247065544,
	"kl": 0.03851825185120106,
	"learning_rate": 3.985714285714286e-05,
	"loss": 0.0004,
	"num_tokens": 2546488.0,
	"reward": 1.0125000476837158,
	"reward_std": 0.01336306519806385,
	"rewards/oai_reward_function/mean": 0.5062500000931323,
	"rewards/oai_reward_function/std": 0.016800537705421448,
	"step": 143
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08246604166924953,
	"epoch": 2.057142857142857,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.042957283556461334,
	"kl": 0.03783240728080273,
	"learning_rate": 3.978571428571429e-05,
	"loss": 0.0004,
	"num_tokens": 2564176.0,
	"reward": 1.0234375,
	"reward_std": 0.01695253700017929,
	"rewards/oai_reward_function/mean": 0.51171875,
	"rewards/oai_reward_function/std": 0.026169713586568832,
	"step": 144
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.10432570241391659,
	"epoch": 2.0714285714285716,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.09850599616765976,
	"kl": 0.037014870904386044,
	"learning_rate": 3.971428571428571e-05,
	"loss": 0.0004,
	"num_tokens": 2581944.0,
	"reward": 1.0250000953674316,
	"reward_std": 0.15622428059577942,
	"rewards/oai_reward_function/mean": 0.5124999992549419,
	"rewards/oai_reward_function/std": 0.17416272684931755,
	"step": 145
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09751013852655888,
	"epoch": 2.085714285714286,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.060189735144376755,
	"kl": 0.050427704118192196,
	"learning_rate": 3.964285714285714e-05,
	"loss": 0.0005,
	"num_tokens": 2599616.0,
	"reward": 1.0265624523162842,
	"reward_std": 0.008010865189135075,
	"rewards/oai_reward_function/mean": 0.513281250372529,
	"rewards/oai_reward_function/std": 0.024580655619502068,
	"step": 146
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.1012180857360363,
	"epoch": 2.1,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.09098206460475922,
	"kl": 0.05104807484894991,
	"learning_rate": 3.9571428571428574e-05,
	"loss": 0.0005,
	"num_tokens": 2617576.0,
	"reward": 1.2890625,
	"reward_std": 0.033694587647914886,
	"rewards/oai_reward_function/mean": 0.6445312350988388,
	"rewards/oai_reward_function/std": 0.1918431520462036,
	"step": 147
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08674592711031437,
	"epoch": 2.1142857142857143,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.0632624626159668,
	"kl": 0.035351223312318325,
	"learning_rate": 3.9500000000000005e-05,
	"loss": 0.0004,
	"num_tokens": 2635248.0,
	"reward": 1.0265624523162842,
	"reward_std": 0.01813914254307747,
	"rewards/oai_reward_function/mean": 0.513281250372529,
	"rewards/oai_reward_function/std": 0.021982740610837936,
	"step": 148
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.11086461879312992,
	"epoch": 2.1285714285714286,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.13065817952156067,
	"kl": 0.06040171813219786,
	"learning_rate": 3.942857142857143e-05,
	"loss": 0.0006,
	"num_tokens": 2653096.0,
	"reward": 1.037500023841858,
	"reward_std": 0.14793866872787476,
	"rewards/oai_reward_function/mean": 0.5187500044703484,
	"rewards/oai_reward_function/std": 0.12740343809127808,
	"step": 149
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.11983237601816654,
	"epoch": 2.142857142857143,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.11544425040483475,
	"kl": 0.06588536128401756,
	"learning_rate": 3.935714285714286e-05,
	"loss": 0.0007,
	"num_tokens": 2670944.0,
	"reward": 1.0812499523162842,
	"reward_std": 0.035140641033649445,
	"rewards/oai_reward_function/mean": 0.5406249985098839,
	"rewards/oai_reward_function/std": 0.023546453565359116,
	"step": 150
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08963452652096748,
	"epoch": 2.157142857142857,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.06575662642717361,
	"kl": 0.05113219376653433,
	"learning_rate": 3.928571428571429e-05,
	"loss": 0.0005,
	"num_tokens": 2688680.0,
	"reward": 1.154687523841858,
	"reward_std": 0.03592789173126221,
	"rewards/oai_reward_function/mean": 0.5773437470197678,
	"rewards/oai_reward_function/std": 0.13520585000514984,
	"step": 151
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.10959535092115402,
	"epoch": 2.1714285714285713,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.09100169688463211,
	"kl": 0.04979555029422045,
	"learning_rate": 3.9214285714285714e-05,
	"loss": 0.0005,
	"num_tokens": 2706528.0,
	"reward": 1.3046875,
	"reward_std": 0.032156482338905334,
	"rewards/oai_reward_function/mean": 0.6523437350988388,
	"rewards/oai_reward_function/std": 0.1942063421010971,
	"step": 152
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.10072515532374382,
	"epoch": 2.185714285714286,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.11344427615404129,
	"kl": 0.08104220405220985,
	"learning_rate": 3.9142857142857145e-05,
	"loss": 0.0008,
	"num_tokens": 2724424.0,
	"reward": 1.3984375,
	"reward_std": 0.06430189311504364,
	"rewards/oai_reward_function/mean": 0.69921875,
	"rewards/oai_reward_function/std": 0.19055142998695374,
	"step": 153
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.13380656391382217,
	"epoch": 2.2,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.10259576886892319,
	"kl": 0.047852903604507446,
	"learning_rate": 3.9071428571428575e-05,
	"loss": 0.0005,
	"num_tokens": 2742272.0,
	"reward": 1.0578124523162842,
	"reward_std": 0.026579542085528374,
	"rewards/oai_reward_function/mean": 0.5289062485098839,
	"rewards/oai_reward_function/std": 0.05354730039834976,
	"step": 154
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.1082126721739769,
	"epoch": 2.2142857142857144,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.10249610245227814,
	"kl": 0.07078076247125864,
	"learning_rate": 3.9000000000000006e-05,
	"loss": 0.0007,
	"num_tokens": 2760088.0,
	"reward": 1.0593750476837158,
	"reward_std": 0.036339618265628815,
	"rewards/oai_reward_function/mean": 0.5296875014901161,
	"rewards/oai_reward_function/std": 0.04327215999364853,
	"step": 155
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.10915260016918182,
	"epoch": 2.2285714285714286,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.08330399543046951,
	"kl": 0.07353132590651512,
	"learning_rate": 3.892857142857143e-05,
	"loss": 0.0007,
	"num_tokens": 2777936.0,
	"reward": 1.25,
	"reward_std": 0.046066030859947205,
	"rewards/oai_reward_function/mean": 0.625,
	"rewards/oai_reward_function/std": 0.1287345290184021,
	"step": 156
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08624540269374847,
	"epoch": 2.242857142857143,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.0943220779299736,
	"kl": 0.06203949544578791,
	"learning_rate": 3.885714285714286e-05,
	"loss": 0.0006,
	"num_tokens": 2795664.0,
	"reward": 1.024999976158142,
	"reward_std": 0.023145508021116257,
	"rewards/oai_reward_function/mean": 0.5125000001862645,
	"rewards/oai_reward_function/std": 0.02199706807732582,
	"step": 157
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09054199792444706,
	"epoch": 2.257142857142857,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.07400333881378174,
	"kl": 0.04232563078403473,
	"learning_rate": 3.8785714285714285e-05,
	"loss": 0.0004,
	"num_tokens": 2813352.0,
	"reward": 1.0499999523162842,
	"reward_std": 0.0258774571120739,
	"rewards/oai_reward_function/mean": 0.5250000022351742,
	"rewards/oai_reward_function/std": 0.03810004144906998,
	"step": 158
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.10258343070745468,
	"epoch": 2.2714285714285714,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.09145762026309967,
	"kl": 0.07726636342704296,
	"learning_rate": 3.8714285714285715e-05,
	"loss": 0.0008,
	"num_tokens": 2831304.0,
	"reward": 1.0593750476837158,
	"reward_std": 0.03966484218835831,
	"rewards/oai_reward_function/mean": 0.5296875014901161,
	"rewards/oai_reward_function/std": 0.05057631433010101,
	"step": 159
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08886106871068478,
	"epoch": 2.2857142857142856,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.050053730607032776,
	"kl": 0.0593466404825449,
	"learning_rate": 3.8642857142857146e-05,
	"loss": 0.0006,
	"num_tokens": 2849216.0,
	"reward": 1.0031249523162842,
	"reward_std": 0.008838832378387451,
	"rewards/oai_reward_function/mean": 0.5015625000232831,
	"rewards/oai_reward_function/std": 0.008838835172355175,
	"step": 160
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08666450530290604,
	"epoch": 2.3,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.09668877720832825,
	"kl": 0.037179723381996155,
	"learning_rate": 3.857142857142858e-05,
	"loss": 0.0004,
	"num_tokens": 2867016.0,
	"reward": 1.0671875476837158,
	"reward_std": 0.034592773765325546,
	"rewards/oai_reward_function/mean": 0.5335937514901161,
	"rewards/oai_reward_function/std": 0.038942355662584305,
	"step": 161
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08739319443702698,
	"epoch": 2.314285714285714,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.05101403221487999,
	"kl": 0.016047978308051825,
	"learning_rate": 3.85e-05,
	"loss": 0.0002,
	"num_tokens": 2884784.0,
	"reward": 1.001562476158142,
	"reward_std": 0.004419416189193726,
	"rewards/oai_reward_function/mean": 0.5007812500116415,
	"rewards/oai_reward_function/std": 0.0044194175861775875,
	"step": 162
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.07174593396484852,
	"epoch": 2.3285714285714287,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.09460754692554474,
	"kl": 0.031096406280994415,
	"learning_rate": 3.842857142857143e-05,
	"loss": 0.0003,
	"num_tokens": 2902576.0,
	"reward": 1.0315624475479126,
	"reward_std": 0.03596320003271103,
	"rewards/oai_reward_function/mean": 0.5157812498509884,
	"rewards/oai_reward_function/std": 0.02667333371937275,
	"step": 163
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.0965243298560381,
	"epoch": 2.342857142857143,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.10237058997154236,
	"kl": 0.025380919221788645,
	"learning_rate": 3.8357142857142855e-05,
	"loss": 0.0003,
	"num_tokens": 2920416.0,
	"reward": 1.071874976158142,
	"reward_std": 0.045641690492630005,
	"rewards/oai_reward_function/mean": 0.5359374992549419,
	"rewards/oai_reward_function/std": 0.04396548494696617,
	"step": 164
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.0735629927366972,
	"epoch": 2.357142857142857,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.001382152666337788,
	"kl": 0.011448808014392853,
	"learning_rate": 3.8285714285714286e-05,
	"loss": 0.0001,
	"num_tokens": 2938224.0,
	"reward": 1.0,
	"reward_std": 0.0,
	"rewards/oai_reward_function/mean": 0.5,
	"rewards/oai_reward_function/std": 0.0,
	"step": 165
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.07639794796705246,
	"epoch": 2.3714285714285714,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.05336588993668556,
	"kl": 0.0100309734698385,
	"learning_rate": 3.821428571428572e-05,
	"loss": 0.0001,
	"num_tokens": 2956000.0,
	"reward": 1.0187499523162842,
	"reward_std": 0.017677675932645798,
	"rewards/oai_reward_function/mean": 0.509375000372529,
	"rewards/oai_reward_function/std": 0.023546453565359116,
	"step": 166
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.0797042902559042,
	"epoch": 2.3857142857142857,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.09860816597938538,
	"kl": 0.02236688695847988,
	"learning_rate": 3.814285714285715e-05,
	"loss": 0.0002,
	"num_tokens": 2973728.0,
	"reward": 1.0421874523162842,
	"reward_std": 0.03380424156785011,
	"rewards/oai_reward_function/mean": 0.5210937485098839,
	"rewards/oai_reward_function/std": 0.03052588365972042,
	"step": 167
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.09247681871056557,
	"epoch": 2.4,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.11807835847139359,
	"kl": 0.028510943986475468,
	"learning_rate": 3.807142857142857e-05,
	"loss": 0.0003,
	"num_tokens": 2991648.0,
	"reward": 1.0703125,
	"reward_std": 0.04926247149705887,
	"rewards/oai_reward_function/mean": 0.53515625,
	"rewards/oai_reward_function/std": 0.036400206387043,
	"step": 168
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.0499194972217083,
	"epoch": 2.414285714285714,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.11116263270378113,
	"kl": 0.04615373630076647,
	"learning_rate": 3.8e-05,
	"loss": 0.0005,
	"num_tokens": 3009472.0,
	"reward": 1.131250023841858,
	"reward_std": 0.0681503415107727,
	"rewards/oai_reward_function/mean": 0.5656249970197678,
	"rewards/oai_reward_function/std": 0.06772513687610626,
	"step": 169
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.07691787928342819,
	"epoch": 2.4285714285714284,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.043317168951034546,
	"kl": 0.026559457648545504,
	"learning_rate": 3.792857142857143e-05,
	"loss": 0.0003,
	"num_tokens": 3027312.0,
	"reward": 1.0812499523162842,
	"reward_std": 0.013363059610128403,
	"rewards/oai_reward_function/mean": 0.5406249985098839,
	"rewards/oai_reward_function/std": 0.04867187887430191,
	"step": 170
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08763985149562359,
	"epoch": 2.442857142857143,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.11738862097263336,
	"kl": 0.028244417626410723,
	"learning_rate": 3.785714285714286e-05,
	"loss": 0.0003,
	"num_tokens": 3045248.0,
	"reward": 1.2421875,
	"reward_std": 0.038010139018297195,
	"rewards/oai_reward_function/mean": 0.62109375,
	"rewards/oai_reward_function/std": 0.17051976919174194,
	"step": 171
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.07618978433310986,
	"epoch": 2.4571428571428573,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.06447312235832214,
	"kl": 0.009853521827608347,
	"learning_rate": 3.778571428571429e-05,
	"loss": 0.0001,
	"num_tokens": 3063104.0,
	"reward": 1.0125000476837158,
	"reward_std": 0.013363069854676723,
	"rewards/oai_reward_function/mean": 0.5062500000931323,
	"rewards/oai_reward_function/std": 0.016800537705421448,
	"step": 172
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.05904076434671879,
	"epoch": 2.4714285714285715,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.08978112041950226,
	"kl": 0.03310586418956518,
	"learning_rate": 3.771428571428572e-05,
	"loss": 0.0003,
	"num_tokens": 3081032.0,
	"reward": 1.109375,
	"reward_std": 0.04799327254295349,
	"rewards/oai_reward_function/mean": 0.5546875,
	"rewards/oai_reward_function/std": 0.05903713405132294,
	"step": 173
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.0563393235206604,
	"epoch": 2.4857142857142858,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.11673219501972198,
	"kl": 0.01918662153184414,
	"learning_rate": 3.764285714285715e-05,
	"loss": 0.0002,
	"num_tokens": 3098880.0,
	"reward": 1.1156249046325684,
	"reward_std": 0.04642024636268616,
	"rewards/oai_reward_function/mean": 0.5578125007450581,
	"rewards/oai_reward_function/std": 0.04554221034049988,
	"step": 174
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.054440722800791264,
	"epoch": 2.5,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0016670229379087687,
	"kl": 0.011914134491235018,
	"learning_rate": 3.757142857142857e-05,
	"loss": 0.0001,
	"num_tokens": 3116528.0,
	"reward": 1.0,
	"reward_std": 0.0,
	"rewards/oai_reward_function/mean": 0.5,
	"rewards/oai_reward_function/std": 0.0,
	"step": 175
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.07713918946683407,
	"epoch": 2.5142857142857142,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.09991607069969177,
	"kl": 0.014265456004068255,
	"learning_rate": 3.7500000000000003e-05,
	"loss": 0.0001,
	"num_tokens": 3134280.0,
	"reward": 1.0140624046325684,
	"reward_std": 0.026196977123618126,
	"rewards/oai_reward_function/mean": 0.5070312502793968,
	"rewards/oai_reward_function/std": 0.017079481855034828,
	"step": 176
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.060492053627967834,
	"epoch": 2.5285714285714285,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.11897552013397217,
	"kl": 0.03578268736600876,
	"learning_rate": 3.742857142857143e-05,
	"loss": 0.0004,
	"num_tokens": 3152096.0,
	"reward": 1.09375,
	"reward_std": 0.05294632539153099,
	"rewards/oai_reward_function/mean": 0.546875,
	"rewards/oai_reward_function/std": 0.044336508959531784,
	"step": 177
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.062405264005064964,
	"epoch": 2.5428571428571427,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.09012026339769363,
	"kl": 0.017184360651299357,
	"learning_rate": 3.735714285714286e-05,
	"loss": 0.0002,
	"num_tokens": 3169776.0,
	"reward": 1.2531249523162842,
	"reward_std": 0.029978279024362564,
	"rewards/oai_reward_function/mean": 0.6265625059604645,
	"rewards/oai_reward_function/std": 0.17049944400787354,
	"step": 178
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.06565988063812256,
	"epoch": 2.557142857142857,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.126982182264328,
	"kl": 0.038017953746020794,
	"learning_rate": 3.728571428571428e-05,
	"loss": 0.0004,
	"num_tokens": 3187712.0,
	"reward": 1.3125,
	"reward_std": 0.041240036487579346,
	"rewards/oai_reward_function/mean": 0.65625,
	"rewards/oai_reward_function/std": 0.1866512894630432,
	"step": 179
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.040435372851789,
	"epoch": 2.571428571428571,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.07750152051448822,
	"kl": 0.03242550138384104,
	"learning_rate": 3.721428571428572e-05,
	"loss": 0.0003,
	"num_tokens": 3205592.0,
	"reward": 1.256250023841858,
	"reward_std": 0.025646153837442398,
	"rewards/oai_reward_function/mean": 0.628125011920929,
	"rewards/oai_reward_function/std": 0.18651622533798218,
	"step": 180
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.047216037288308144,
	"epoch": 2.585714285714286,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.12098561972379684,
	"kl": 0.026824071537703276,
	"learning_rate": 3.7142857142857143e-05,
	"loss": 0.0003,
	"num_tokens": 3223480.0,
	"reward": 1.0093750953674316,
	"reward_std": 0.19897010922431946,
	"rewards/oai_reward_function/mean": 0.5046875029802322,
	"rewards/oai_reward_function/std": 0.21528521552681923,
	"step": 181
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08010220341384411,
	"epoch": 2.6,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.11290978640317917,
	"kl": 0.022963001858443022,
	"learning_rate": 3.7071428571428574e-05,
	"loss": 0.0002,
	"num_tokens": 3241304.0,
	"reward": 1.1468751430511475,
	"reward_std": 0.08732541650533676,
	"rewards/oai_reward_function/mean": 0.5734374970197678,
	"rewards/oai_reward_function/std": 0.11707756668329239,
	"step": 182
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.06848571076989174,
	"epoch": 2.6142857142857143,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.10999053716659546,
	"kl": 0.024655529763549566,
	"learning_rate": 3.7e-05,
	"loss": 0.0002,
	"num_tokens": 3259048.0,
	"reward": 1.165624976158142,
	"reward_std": 0.0838727056980133,
	"rewards/oai_reward_function/mean": 0.5828125029802322,
	"rewards/oai_reward_function/std": 0.14652389287948608,
	"step": 183
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.049521847628057,
	"epoch": 2.6285714285714286,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.10953173786401749,
	"kl": 0.02982867369428277,
	"learning_rate": 3.692857142857143e-05,
	"loss": 0.0003,
	"num_tokens": 3276808.0,
	"reward": 1.171875,
	"reward_std": 0.061461035162210464,
	"rewards/oai_reward_function/mean": 0.5859375,
	"rewards/oai_reward_function/std": 0.1271488517522812,
	"step": 184
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.07211006805300713,
	"epoch": 2.642857142857143,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.1074090376496315,
	"kl": 0.036498697474598885,
	"learning_rate": 3.685714285714286e-05,
	"loss": 0.0004,
	"num_tokens": 3294808.0,
	"reward": 1.162500023841858,
	"reward_std": 0.13342483341693878,
	"rewards/oai_reward_function/mean": 0.5812499970197678,
	"rewards/oai_reward_function/std": 0.1636282056570053,
	"step": 185
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.04203084297478199,
	"epoch": 2.657142857142857,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.07064300775527954,
	"kl": 0.02704466599971056,
	"learning_rate": 3.678571428571429e-05,
	"loss": 0.0003,
	"num_tokens": 3312640.0,
	"reward": 1.0750000476837158,
	"reward_std": 0.018898211419582367,
	"rewards/oai_reward_function/mean": 0.5375000014901161,
	"rewards/oai_reward_function/std": 0.06839166581630707,
	"step": 186
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.054328473284840584,
	"epoch": 2.6714285714285713,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.06820113956928253,
	"kl": 0.022003832273185253,
	"learning_rate": 3.671428571428572e-05,
	"loss": 0.0002,
	"num_tokens": 3330408.0,
	"reward": 1.1531250476837158,
	"reward_std": 0.0646936446428299,
	"rewards/oai_reward_function/mean": 0.5765625014901161,
	"rewards/oai_reward_function/std": 0.14809781312942505,
	"step": 187
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.051017552614212036,
	"epoch": 2.685714285714286,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.08293981850147247,
	"kl": 0.02239195117726922,
	"learning_rate": 3.6642857142857145e-05,
	"loss": 0.0002,
	"num_tokens": 3348064.0,
	"reward": 1.0109374523162842,
	"reward_std": 0.017358144745230675,
	"rewards/oai_reward_function/mean": 0.505468750372529,
	"rewards/oai_reward_function/std": 0.015206077136099339,
	"step": 188
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.049897488206624985,
	"epoch": 2.7,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.09536179155111313,
	"kl": 0.05423136055469513,
	"learning_rate": 3.6571428571428576e-05,
	"loss": 0.0005,
	"num_tokens": 3365896.0,
	"reward": 1.1484375,
	"reward_std": 0.06608611345291138,
	"rewards/oai_reward_function/mean": 0.57421875,
	"rewards/oai_reward_function/std": 0.10268264263868332,
	"step": 189
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.05249054729938507,
	"epoch": 2.7142857142857144,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.13304099440574646,
	"kl": 0.04763131029903889,
	"learning_rate": 3.65e-05,
	"loss": 0.0005,
	"num_tokens": 3383800.0,
	"reward": 1.0828125476837158,
	"reward_std": 0.04099529981613159,
	"rewards/oai_reward_function/mean": 0.5414062514901161,
	"rewards/oai_reward_function/std": 0.04943608492612839,
	"step": 190
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.062329867854714394,
	"epoch": 2.7285714285714286,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.11173869669437408,
	"kl": 0.03640593169257045,
	"learning_rate": 3.642857142857143e-05,
	"loss": 0.0004,
	"num_tokens": 3401648.0,
	"reward": 1.0562500953674316,
	"reward_std": 0.023689784109592438,
	"rewards/oai_reward_function/mean": 0.5281250011175871,
	"rewards/oai_reward_function/std": 0.0274963341653347,
	"step": 191
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.058776866644620895,
	"epoch": 2.742857142857143,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.10382409393787384,
	"kl": 0.03305045561864972,
	"learning_rate": 3.6357142857142854e-05,
	"loss": 0.0003,
	"num_tokens": 3419408.0,
	"reward": 1.217187523841858,
	"reward_std": 0.02610759809613228,
	"rewards/oai_reward_function/mean": 0.6085937470197678,
	"rewards/oai_reward_function/std": 0.1844356507062912,
	"step": 192
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.04789746552705765,
	"epoch": 2.757142857142857,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.05706682801246643,
	"kl": 0.03994939010590315,
	"learning_rate": 3.628571428571429e-05,
	"loss": 0.0004,
	"num_tokens": 3437112.0,
	"reward": 1.015625,
	"reward_std": 0.01860060542821884,
	"rewards/oai_reward_function/mean": 0.5078125,
	"rewards/oai_reward_function/std": 0.022394467145204544,
	"step": 193
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.062057855539023876,
	"epoch": 2.7714285714285714,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.15330444276332855,
	"kl": 0.084334472194314,
	"learning_rate": 3.6214285714285716e-05,
	"loss": 0.0008,
	"num_tokens": 3454904.0,
	"reward": 1.470312476158142,
	"reward_std": 0.04739333689212799,
	"rewards/oai_reward_function/mean": 0.735156238079071,
	"rewards/oai_reward_function/std": 0.19775548577308655,
	"step": 194
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.04913834575563669,
	"epoch": 2.7857142857142856,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.11146184056997299,
	"kl": 0.06489744689315557,
	"learning_rate": 3.6142857142857146e-05,
	"loss": 0.0006,
	"num_tokens": 3472632.0,
	"reward": 1.0703125,
	"reward_std": 0.045694079250097275,
	"rewards/oai_reward_function/mean": 0.53515625,
	"rewards/oai_reward_function/std": 0.04438621550798416,
	"step": 195
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.057510885410010815,
	"epoch": 2.8,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.1315256953239441,
	"kl": 0.06008041184395552,
	"learning_rate": 3.607142857142857e-05,
	"loss": 0.0006,
	"num_tokens": 3490592.0,
	"reward": 1.060937523841858,
	"reward_std": 0.02575094997882843,
	"rewards/oai_reward_function/mean": 0.5304687507450581,
	"rewards/oai_reward_function/std": 0.04522986710071564,
	"step": 196
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.05490284040570259,
	"epoch": 2.814285714285714,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.1558970808982849,
	"kl": 0.0605736318975687,
	"learning_rate": 3.6e-05,
	"loss": 0.0006,
	"num_tokens": 3508408.0,
	"reward": 1.1359374523162842,
	"reward_std": 0.16093073785305023,
	"rewards/oai_reward_function/mean": 0.5679687559604645,
	"rewards/oai_reward_function/std": 0.12798601388931274,
	"step": 197
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.05418549384921789,
	"epoch": 2.8285714285714287,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.060409124940633774,
	"kl": 0.07027391903102398,
	"learning_rate": 3.5928571428571425e-05,
	"loss": 0.0007,
	"num_tokens": 3526168.0,
	"reward": 1.0281250476837158,
	"reward_std": 0.008838837966322899,
	"rewards/oai_reward_function/mean": 0.514062499627471,
	"rewards/oai_reward_function/std": 0.026133574545383453,
	"step": 198
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.03633992746472359,
	"epoch": 2.842857142857143,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.003255989169701934,
	"kl": 0.07370059937238693,
	"learning_rate": 3.585714285714286e-05,
	"loss": 0.0007,
	"num_tokens": 3543864.0,
	"reward": 1.0,
	"reward_std": 0.0,
	"rewards/oai_reward_function/mean": 0.5,
	"rewards/oai_reward_function/std": 0.0,
	"step": 199
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.057329680770635605,
	"epoch": 2.857142857142857,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.11008545011281967,
	"kl": 0.06796468701213598,
	"learning_rate": 3.5785714285714286e-05,
	"loss": 0.0007,
	"num_tokens": 3561688.0,
	"reward": 1.25,
	"reward_std": 0.014625202864408493,
	"rewards/oai_reward_function/mean": 0.625,
	"rewards/oai_reward_function/std": 0.21655291318893433,
	"step": 200
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.04236162081360817,
	"epoch": 2.8714285714285714,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.15067800879478455,
	"kl": 0.10442700423300266,
	"learning_rate": 3.571428571428572e-05,
	"loss": 0.001,
	"num_tokens": 3579560.0,
	"reward": 1.2906250953674316,
	"reward_std": 0.06347659230232239,
	"rewards/oai_reward_function/mean": 0.6453125029802322,
	"rewards/oai_reward_function/std": 0.18144108355045319,
	"step": 201
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.04874769877642393,
	"epoch": 2.8857142857142857,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.08031316101551056,
	"kl": 0.06067081820219755,
	"learning_rate": 3.564285714285715e-05,
	"loss": 0.0006,
	"num_tokens": 3597200.0,
	"reward": 1.037500023841858,
	"reward_std": 0.019918914884328842,
	"rewards/oai_reward_function/mean": 0.5187500007450581,
	"rewards/oai_reward_function/std": 0.023759547621011734,
	"step": 202
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.06220845878124237,
	"epoch": 2.9,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.12046143412590027,
	"kl": 0.05884059518575668,
	"learning_rate": 3.557142857142857e-05,
	"loss": 0.0006,
	"num_tokens": 3615112.0,
	"reward": 1.076562523841858,
	"reward_std": 0.05444490164518356,
	"rewards/oai_reward_function/mean": 0.5382812507450581,
	"rewards/oai_reward_function/std": 0.04835369065403938,
	"step": 203
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.05830034799873829,
	"epoch": 2.914285714285714,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.09531212598085403,
	"kl": 0.05973371770232916,
	"learning_rate": 3.55e-05,
	"loss": 0.0006,
	"num_tokens": 3632936.0,
	"reward": 1.1078124046325684,
	"reward_std": 0.037323713302612305,
	"rewards/oai_reward_function/mean": 0.5539062544703484,
	"rewards/oai_reward_function/std": 0.07622901350259781,
	"step": 204
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.05407467018812895,
	"epoch": 2.928571428571429,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.07925046980381012,
	"kl": 0.07386120036244392,
	"learning_rate": 3.5428571428571426e-05,
	"loss": 0.0007,
	"num_tokens": 3650760.0,
	"reward": 1.0140624046325684,
	"reward_std": 0.02122672274708748,
	"rewards/oai_reward_function/mean": 0.5070312502793968,
	"rewards/oai_reward_function/std": 0.017079481855034828,
	"step": 205
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.06182014662772417,
	"epoch": 2.942857142857143,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.11716300994157791,
	"kl": 0.06741901952773333,
	"learning_rate": 3.5357142857142864e-05,
	"loss": 0.0007,
	"num_tokens": 3668512.0,
	"reward": 1.0906250476837158,
	"reward_std": 0.055445872247219086,
	"rewards/oai_reward_function/mean": 0.5453125014901161,
	"rewards/oai_reward_function/std": 0.06968752294778824,
	"step": 206
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.059788716956973076,
	"epoch": 2.9571428571428573,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.13249847292900085,
	"kl": 0.08083864115178585,
	"learning_rate": 3.528571428571429e-05,
	"loss": 0.0008,
	"num_tokens": 3686296.0,
	"reward": 1.2609375715255737,
	"reward_std": 0.032799478620290756,
	"rewards/oai_reward_function/mean": 0.6304687410593033,
	"rewards/oai_reward_function/std": 0.16235841810703278,
	"step": 207
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.06789828836917877,
	"epoch": 2.9714285714285715,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.1304040104150772,
	"kl": 0.07522418349981308,
	"learning_rate": 3.521428571428572e-05,
	"loss": 0.0008,
	"num_tokens": 3704008.0,
	"reward": 1.2593750953674316,
	"reward_std": 0.05023520812392235,
	"rewards/oai_reward_function/mean": 0.6296875029802322,
	"rewards/oai_reward_function/std": 0.1668539047241211,
	"step": 208
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.04057574924081564,
	"epoch": 2.9857142857142858,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.1246933788061142,
	"kl": 0.10525520890951157,
	"learning_rate": 3.514285714285714e-05,
	"loss": 0.0011,
	"num_tokens": 3721888.0,
	"reward": 1.2625000476837158,
	"reward_std": 0.03328196331858635,
	"rewards/oai_reward_function/mean": 0.6312500089406967,
	"rewards/oai_reward_function/std": 0.1866512894630432,
	"step": 209
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.06538868602365255,
	"epoch": 3.0,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.11987200379371643,
	"kl": 0.090326476842165,
	"learning_rate": 3.507142857142857e-05,
	"loss": 0.0009,
	"num_tokens": 3739752.0,
	"reward": 1.0437500476837158,
	"reward_std": 0.040318816900253296,
	"rewards/oai_reward_function/mean": 0.5218750014901161,
	"rewards/oai_reward_function/std": 0.03521248698234558,
	"step": 210
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.06806700490415096,
	"epoch": 3.0142857142857142,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.08975692093372345,
	"kl": 0.08757120184600353,
	"learning_rate": 3.5e-05,
	"loss": 0.0009,
	"num_tokens": 3757520.0,
	"reward": 1.0203125476837158,
	"reward_std": 0.024814628064632416,
	"rewards/oai_reward_function/mean": 0.510156249627471,
	"rewards/oai_reward_function/std": 0.019938793033361435,
	"step": 211
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.05651993863284588,
	"epoch": 3.0285714285714285,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.10675106197595596,
	"kl": 0.09534911066293716,
	"learning_rate": 3.4928571428571434e-05,
	"loss": 0.001,
	"num_tokens": 3775296.0,
	"reward": 1.0750000476837158,
	"reward_std": 0.06767623126506805,
	"rewards/oai_reward_function/mean": 0.5375000014901161,
	"rewards/oai_reward_function/std": 0.07378040999174118,
	"step": 212
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.0639553228393197,
	"epoch": 3.0428571428571427,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.09777996689081192,
	"kl": 0.07890664599835873,
	"learning_rate": 3.485714285714286e-05,
	"loss": 0.0008,
	"num_tokens": 3793032.0,
	"reward": 1.0281250476837158,
	"reward_std": 0.020751874893903732,
	"rewards/oai_reward_function/mean": 0.5140625005587935,
	"rewards/oai_reward_function/std": 0.021939707919955254,
	"step": 213
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.06698552891612053,
	"epoch": 3.057142857142857,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.1115046888589859,
	"kl": 0.07001018989831209,
	"learning_rate": 3.478571428571429e-05,
	"loss": 0.0007,
	"num_tokens": 3810744.0,
	"reward": 1.056249976158142,
	"reward_std": 0.030470959842205048,
	"rewards/oai_reward_function/mean": 0.5281250011175871,
	"rewards/oai_reward_function/std": 0.04741290956735611,
	"step": 214
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.06257231812924147,
	"epoch": 3.0714285714285716,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.10517910867929459,
	"kl": 0.07934985496103764,
	"learning_rate": 3.471428571428571e-05,
	"loss": 0.0008,
	"num_tokens": 3828496.0,
	"reward": 1.235937476158142,
	"reward_std": 0.012387894093990326,
	"rewards/oai_reward_function/mean": 0.6179687529802322,
	"rewards/oai_reward_function/std": 0.20793089270591736,
	"step": 215
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.055911571718752384,
	"epoch": 3.085714285714286,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.15163163840770721,
	"kl": 0.11663151904940605,
	"learning_rate": 3.4642857142857144e-05,
	"loss": 0.0012,
	"num_tokens": 3846408.0,
	"reward": 1.2062499523162842,
	"reward_std": 0.1645711362361908,
	"rewards/oai_reward_function/mean": 0.6031250059604645,
	"rewards/oai_reward_function/std": 0.10957211256027222,
	"step": 216
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.05231211241334677,
	"epoch": 3.1,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.17118224501609802,
	"kl": 0.12115776538848877,
	"learning_rate": 3.4571428571428574e-05,
	"loss": 0.0012,
	"num_tokens": 3864168.0,
	"reward": 1.0859375,
	"reward_std": 0.1350831389427185,
	"rewards/oai_reward_function/mean": 0.5429687462747097,
	"rewards/oai_reward_function/std": 0.11469355970621109,
	"step": 217
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.05661669000983238,
	"epoch": 3.1142857142857143,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.13312071561813354,
	"kl": 0.08971596322953701,
	"learning_rate": 3.45e-05,
	"loss": 0.0009,
	"num_tokens": 3882016.0,
	"reward": 1.2296874523162842,
	"reward_std": 0.02697797492146492,
	"rewards/oai_reward_function/mean": 0.6148437485098839,
	"rewards/oai_reward_function/std": 0.1935303658246994,
	"step": 218
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.0506694195792079,
	"epoch": 3.1285714285714286,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.10720871388912201,
	"kl": 0.07849705778062344,
	"learning_rate": 3.442857142857143e-05,
	"loss": 0.0008,
	"num_tokens": 3899952.0,
	"reward": 1.1015625,
	"reward_std": 0.044115059077739716,
	"rewards/oai_reward_function/mean": 0.55078125,
	"rewards/oai_reward_function/std": 0.055534202605485916,
	"step": 219
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.05330024380236864,
	"epoch": 3.142857142857143,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.12851352989673615,
	"kl": 0.0983127523213625,
	"learning_rate": 3.435714285714286e-05,
	"loss": 0.001,
	"num_tokens": 3917688.0,
	"reward": 1.365625023841858,
	"reward_std": 0.12765255570411682,
	"rewards/oai_reward_function/mean": 0.6828124970197678,
	"rewards/oai_reward_function/std": 0.20135001838207245,
	"step": 220
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.05522188264876604,
	"epoch": 3.157142857142857,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.11083745956420898,
	"kl": 0.07289117947220802,
	"learning_rate": 3.428571428571429e-05,
	"loss": 0.0007,
	"num_tokens": 3935528.0,
	"reward": 1.046875,
	"reward_std": 0.0414334312081337,
	"rewards/oai_reward_function/mean": 0.5234375,
	"rewards/oai_reward_function/std": 0.039623990654945374,
	"step": 221
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.05520590580999851,
	"epoch": 3.1714285714285713,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.11108041554689407,
	"kl": 0.08299623243510723,
	"learning_rate": 3.4214285714285714e-05,
	"loss": 0.0008,
	"num_tokens": 3953320.0,
	"reward": 1.2531250715255737,
	"reward_std": 0.017311176285147667,
	"rewards/oai_reward_function/mean": 0.6265624910593033,
	"rewards/oai_reward_function/std": 0.18985748291015625,
	"step": 222
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.05928301624953747,
	"epoch": 3.185714285714286,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.1239665076136589,
	"kl": 0.11972067691385746,
	"learning_rate": 3.4142857142857145e-05,
	"loss": 0.0012,
	"num_tokens": 3971032.0,
	"reward": 1.1640625,
	"reward_std": 0.04833199828863144,
	"rewards/oai_reward_function/mean": 0.58203125,
	"rewards/oai_reward_function/std": 0.12608151137828827,
	"step": 223
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.042910450138151646,
	"epoch": 3.2,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.1018366813659668,
	"kl": 0.08956374414265156,
	"learning_rate": 3.407142857142857e-05,
	"loss": 0.0009,
	"num_tokens": 3988864.0,
	"reward": 1.2765624523162842,
	"reward_std": 0.06503090262413025,
	"rewards/oai_reward_function/mean": 0.6382812410593033,
	"rewards/oai_reward_function/std": 0.19134333729743958,
	"step": 224
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.06987146660685539,
	"epoch": 3.2142857142857144,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.13029474020004272,
	"kl": 0.11290079541504383,
	"learning_rate": 3.4000000000000007e-05,
	"loss": 0.0011,
	"num_tokens": 4006800.0,
	"reward": 1.3203125,
	"reward_std": 0.04008040949702263,
	"rewards/oai_reward_function/mean": 0.66015625,
	"rewards/oai_reward_function/std": 0.20209181308746338,
	"step": 225
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.07145651057362556,
	"epoch": 3.2285714285714286,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.1022149994969368,
	"kl": 0.05857388116419315,
	"learning_rate": 3.392857142857143e-05,
	"loss": 0.0006,
	"num_tokens": 4024680.0,
	"reward": 1.0499999523162842,
	"reward_std": 0.03877411410212517,
	"rewards/oai_reward_function/mean": 0.5249999985098839,
	"rewards/oai_reward_function/std": 0.0416397750377655,
	"step": 226
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.048385005444288254,
	"epoch": 3.242857142857143,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.12312141805887222,
	"kl": 0.07377888821065426,
	"learning_rate": 3.385714285714286e-05,
	"loss": 0.0007,
	"num_tokens": 4042472.0,
	"reward": 1.4500000476837158,
	"reward_std": 0.02340090088546276,
	"rewards/oai_reward_function/mean": 0.7249999940395355,
	"rewards/oai_reward_function/std": 0.23026981949806213,
	"step": 227
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.0545792318880558,
	"epoch": 3.257142857142857,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.09947756677865982,
	"kl": 0.09583424963057041,
	"learning_rate": 3.3785714285714285e-05,
	"loss": 0.001,
	"num_tokens": 4060248.0,
	"reward": 1.0265624523162842,
	"reward_std": 0.10836321860551834,
	"rewards/oai_reward_function/mean": 0.5132812485098839,
	"rewards/oai_reward_function/std": 0.12380600348114967,
	"step": 228
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.06396409310400486,
	"epoch": 3.2714285714285714,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.0686774030327797,
	"kl": 0.09425997547805309,
	"learning_rate": 3.3714285714285716e-05,
	"loss": 0.0009,
	"num_tokens": 4077880.0,
	"reward": 1.2703125476837158,
	"reward_std": 0.017598576843738556,
	"rewards/oai_reward_function/mean": 0.6351562440395355,
	"rewards/oai_reward_function/std": 0.2018921971321106,
	"step": 229
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08321201242506504,
	"epoch": 3.2857142857142856,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.06190980598330498,
	"kl": 0.03877187706530094,
	"learning_rate": 3.364285714285714e-05,
	"loss": 0.0004,
	"num_tokens": 4095672.0,
	"reward": 1.0046875476837158,
	"reward_std": 0.00646935636177659,
	"rewards/oai_reward_function/mean": 0.5023437500931323,
	"rewards/oai_reward_function/std": 0.007403614930808544,
	"step": 230
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.0575382262468338,
	"epoch": 3.3,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.11537446081638336,
	"kl": 0.08915554732084274,
	"learning_rate": 3.357142857142857e-05,
	"loss": 0.0009,
	"num_tokens": 4113480.0,
	"reward": 1.2359375953674316,
	"reward_std": 0.08257875591516495,
	"rewards/oai_reward_function/mean": 0.617968738079071,
	"rewards/oai_reward_function/std": 0.156090646982193,
	"step": 231
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.05642772279679775,
	"epoch": 3.314285714285714,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.13337863981723785,
	"kl": 0.10237299278378487,
	"learning_rate": 3.35e-05,
	"loss": 0.001,
	"num_tokens": 4131224.0,
	"reward": 1.4734375476837158,
	"reward_std": 0.01958364248275757,
	"rewards/oai_reward_function/mean": 0.7367187440395355,
	"rewards/oai_reward_function/std": 0.2408807873725891,
	"step": 232
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.06528778094798326,
	"epoch": 3.3285714285714287,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.13227899372577667,
	"kl": 0.11398253589868546,
	"learning_rate": 3.342857142857143e-05,
	"loss": 0.0011,
	"num_tokens": 4149184.0,
	"reward": 1.3203125,
	"reward_std": 0.03324369713664055,
	"rewards/oai_reward_function/mean": 0.6601562350988388,
	"rewards/oai_reward_function/std": 0.19673332571983337,
	"step": 233
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.06553995609283447,
	"epoch": 3.342857142857143,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.20257574319839478,
	"kl": 0.12704718858003616,
	"learning_rate": 3.3357142857142856e-05,
	"loss": 0.0013,
	"num_tokens": 4167104.0,
	"reward": 1.4609375,
	"reward_std": 0.1858925223350525,
	"rewards/oai_reward_function/mean": 0.73046875,
	"rewards/oai_reward_function/std": 0.1834629327058792,
	"step": 234
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.04393093287944794,
	"epoch": 3.357142857142857,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.12950977683067322,
	"kl": 0.09361258894205093,
	"learning_rate": 3.3285714285714286e-05,
	"loss": 0.0009,
	"num_tokens": 4184944.0,
	"reward": 1.1656250953674316,
	"reward_std": 0.150077685713768,
	"rewards/oai_reward_function/mean": 0.5828124955296516,
	"rewards/oai_reward_function/std": 0.13220180571079254,
	"step": 235
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.06838994100689888,
	"epoch": 3.3714285714285714,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.14525781571865082,
	"kl": 0.09100262448191643,
	"learning_rate": 3.321428571428572e-05,
	"loss": 0.0009,
	"num_tokens": 4202672.0,
	"reward": 1.4187500476837158,
	"reward_std": 0.02699536457657814,
	"rewards/oai_reward_function/mean": 0.7093749940395355,
	"rewards/oai_reward_function/std": 0.21884500980377197,
	"step": 236
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.05996893718838692,
	"epoch": 3.3857142857142857,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.12374205142259598,
	"kl": 0.13878228701651096,
	"learning_rate": 3.314285714285714e-05,
	"loss": 0.0014,
	"num_tokens": 4220472.0,
	"reward": 1.443750023841858,
	"reward_std": 0.060242824256420135,
	"rewards/oai_reward_function/mean": 0.721875011920929,
	"rewards/oai_reward_function/std": 0.18898604810237885,
	"step": 237
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.04969180002808571,
	"epoch": 3.4,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.14094124734401703,
	"kl": 0.11433868668973446,
	"learning_rate": 3.307142857142858e-05,
	"loss": 0.0011,
	"num_tokens": 4238352.0,
	"reward": 1.5062499046325684,
	"reward_std": 0.04232252389192581,
	"rewards/oai_reward_function/mean": 0.7531249821186066,
	"rewards/oai_reward_function/std": 0.21019864082336426,
	"step": 238
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.05842717830091715,
	"epoch": 3.414285714285714,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.13528573513031006,
	"kl": 0.13230286352336407,
	"learning_rate": 3.3e-05,
	"loss": 0.0013,
	"num_tokens": 4256072.0,
	"reward": 1.5250000953674316,
	"reward_std": 0.0736992210149765,
	"rewards/oai_reward_function/mean": 0.762499988079071,
	"rewards/oai_reward_function/std": 0.19999998807907104,
	"step": 239
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.053723374381661415,
	"epoch": 3.4285714285714284,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.10746068507432938,
	"kl": 0.0968917403370142,
	"learning_rate": 3.292857142857143e-05,
	"loss": 0.001,
	"num_tokens": 4273848.0,
	"reward": 1.0499999523162842,
	"reward_std": 0.017677675932645798,
	"rewards/oai_reward_function/mean": 0.525000000372529,
	"rewards/oai_reward_function/std": 0.028398092836141586,
	"step": 240
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.08724895678460598,
	"epoch": 3.442857142857143,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.1673547476530075,
	"kl": 0.16037143021821976,
	"learning_rate": 3.285714285714286e-05,
	"loss": 0.0016,
	"num_tokens": 4291872.0,
	"reward": 1.603124976158142,
	"reward_std": 0.11311184614896774,
	"rewards/oai_reward_function/mean": 0.801562488079071,
	"rewards/oai_reward_function/std": 0.22014817595481873,
	"step": 241
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.06301301345229149,
	"epoch": 3.4571428571428573,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.127055823802948,
	"kl": 0.10341309197247028,
	"learning_rate": 3.278571428571429e-05,
	"loss": 0.001,
	"num_tokens": 4309640.0,
	"reward": 1.265625,
	"reward_std": 0.02265283279120922,
	"rewards/oai_reward_function/mean": 0.6328125,
	"rewards/oai_reward_function/std": 0.19933734834194183,
	"step": 242
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.04968675132840872,
	"epoch": 3.4714285714285715,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.1374424248933792,
	"kl": 0.1188412457704544,
	"learning_rate": 3.271428571428571e-05,
	"loss": 0.0012,
	"num_tokens": 4327360.0,
	"reward": 1.303125023841858,
	"reward_std": 0.10906177759170532,
	"rewards/oai_reward_function/mean": 0.651562511920929,
	"rewards/oai_reward_function/std": 0.2058555632829666,
	"step": 243
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.06477249693125486,
	"epoch": 3.4857142857142858,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.1306895911693573,
	"kl": 0.11430021747946739,
	"learning_rate": 3.264285714285714e-05,
	"loss": 0.0011,
	"num_tokens": 4345192.0,
	"reward": 1.4375,
	"reward_std": 0.1379069834947586,
	"rewards/oai_reward_function/mean": 0.71875,
	"rewards/oai_reward_function/std": 0.18447834253311157,
	"step": 244
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.06405621953308582,
	"epoch": 3.5,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.1333678811788559,
	"kl": 0.17231638357043266,
	"learning_rate": 3.257142857142857e-05,
	"loss": 0.0017,
	"num_tokens": 4363064.0,
	"reward": 1.5109375715255737,
	"reward_std": 0.0335906445980072,
	"rewards/oai_reward_function/mean": 0.7554687559604645,
	"rewards/oai_reward_function/std": 0.22403325140476227,
	"step": 245
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.067863704636693,
	"epoch": 3.5142857142857142,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.13658057153224945,
	"kl": 0.17316893115639687,
	"learning_rate": 3.2500000000000004e-05,
	"loss": 0.0017,
	"num_tokens": 4380968.0,
	"reward": 1.5546875,
	"reward_std": 0.05777457728981972,
	"rewards/oai_reward_function/mean": 0.77734375,
	"rewards/oai_reward_function/std": 0.2121661901473999,
	"step": 246
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.05905670113861561,
	"epoch": 3.5285714285714285,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.09611335396766663,
	"kl": 0.050939660519361496,
	"learning_rate": 3.242857142857143e-05,
	"loss": 0.0005,
	"num_tokens": 4398608.0,
	"reward": 1.0125000476837158,
	"reward_std": 0.02314549870789051,
	"rewards/oai_reward_function/mean": 0.5062500000931323,
	"rewards/oai_reward_function/std": 0.016800537705421448,
	"step": 247
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.07766996510326862,
	"epoch": 3.5428571428571427,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.13582761585712433,
	"kl": 0.12659209407866,
	"learning_rate": 3.235714285714286e-05,
	"loss": 0.0013,
	"num_tokens": 4416392.0,
	"reward": 1.5640625953674316,
	"reward_std": 0.07988262921571732,
	"rewards/oai_reward_function/mean": 0.782031238079071,
	"rewards/oai_reward_function/std": 0.2193496972322464,
	"step": 248
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.07497746869921684,
	"epoch": 3.557142857142857,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.1760389357805252,
	"kl": 0.24591631814837456,
	"learning_rate": 3.228571428571428e-05,
	"loss": 0.0025,
	"num_tokens": 4434272.0,
	"reward": 1.798437476158142,
	"reward_std": 0.12221544235944748,
	"rewards/oai_reward_function/mean": 0.899218738079071,
	"rewards/oai_reward_function/std": 0.10424157232046127,
	"step": 249
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 1.0,
	"completions/max_length": 512.0,
	"completions/max_terminated_length": 0.0,
	"completions/mean_length": 512.0,
	"completions/mean_terminated_length": 0.0,
	"completions/min_length": 512.0,
	"completions/min_terminated_length": 0.0,
	"entropy": 0.06046187411993742,
	"epoch": 3.571428571428571,
	"frac_reward_zero_std": 0.0,
	"grad_norm": 0.15973122417926788,
	"kl": 0.22475793957710266,
	"learning_rate": 3.221428571428571e-05,
	"loss": 0.0022,
	"num_tokens": 4452104.0,
	"reward": 1.6359374523162842,
	"reward_std": 0.0966869369149208,
	"rewards/oai_reward_function/mean": 0.8179687559604645,
	"rewards/oai_reward_function/std": 0.19666926562786102,
	"step": 250
	}
	],
	"logging_steps": 1,
	"max_steps": 700,
	"num_input_tokens_seen": 4452104,
	"num_train_epochs": 10,
	"save_steps": 10,
	"stateful_callbacks": {
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": false,
	"should_log": false,
	"should_save": true,
	"should_training_stop": false
	},
	"attributes": {}
	}
	},
	"total_flos": 0.0,
	"train_batch_size": 8,
	"trial_name": null,
	"trial_params": null
	}